import sys
import os
import time
import numpy as np
import pandas as pd
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import f_classif
from itertools import combinations
from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier
import cupy as cp
import cudf
from cuml.linear_model import LogisticRegression as cuMLLogisticRegression
print('NOVA_HOME is at', os.getenv('NOVA_HOME'))
sys.path.insert(1, os.getenv('NOVA_HOME'))
%load_ext autoreload
%autoreload 2
from src.common.utils import load_config_file
from src.embeddings.embeddings_utils import load_embeddings
from src.analysis.analyzer_multiplex_markers import AnalyzerMultiplexMarkers
from utils import *
NOVA_HOME is at /home/projects/hornsteinlab/Collaboration/NOVA_GAL/NOVA NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA_GAL/NOVA
## Baseline
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
for test_batches in batches:
test_batches = [test_batches]
train_batches = list(set(batches)-set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train), np.shape(X_train), np.unique(y_train))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test), np.shape(X_test), np.unique(y_test))
count_labels(y_test)
# Encode with same label encoder
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
# Optional: balance training set
balance = False
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_enc = ros.fit_resample(X_train, y_train_enc)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Convert to GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_enc = cudf.Series(y_train_enc)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1)
clf.fit(X_train, y_train_enc)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_enc, y_pred, target_names=le.classes_, output_dict=True)
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))
plot_confusion_matrix(y_test_enc, y_pred, le)
accuracy = report['accuracy']
accuracies.append(accuracy)
# Accumulate confusion matrix
cm = confusion_matrix(y_test_enc, y_pred, labels=np.arange(len(le.classes_)))
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
print(np.mean(accuracies), accuracies)
display_labels = [label.replace('_Untreated', '') for label in le.classes_]
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=display_labels)
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(41469,) (41469, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Test dataset
batches [1]
(10932,) (10932, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
FUSRevertant_Untreated: 1015
OPTN_Untreated: 2314
TBK1_Untreated: 1876
TDP43_Untreated: 1699
WT_Untreated: 1561
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.66 0.91 0.76 1222
FUSHomozygous_Untreated 0.87 0.53 0.66 1245
FUSRevertant_Untreated 0.81 0.91 0.86 1015
OPTN_Untreated 0.79 0.51 0.62 2314
TBK1_Untreated 0.12 0.00 0.00 1876
TDP43_Untreated 0.28 0.31 0.29 1699
WT_Untreated 0.28 0.71 0.40 1561
accuracy 0.50 10932
macro avg 0.54 0.55 0.51 10932
weighted avg 0.52 0.50 0.47 10932
Train dataset
batches [1, 3, 7, 8, 9, 10]
(44045,) (44045, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Test dataset
batches [2]
(8356,) (8356, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
FUSRevertant_Untreated: 800
OPTN_Untreated: 1649
TBK1_Untreated: 1220
TDP43_Untreated: 1508
WT_Untreated: 786
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.63 0.64 0.63 1231
FUSHomozygous_Untreated 0.61 0.58 0.60 1162
FUSRevertant_Untreated 0.47 0.99 0.64 800
OPTN_Untreated 0.57 0.65 0.61 1649
TBK1_Untreated 0.64 0.61 0.63 1220
TDP43_Untreated 0.39 0.15 0.22 1508
WT_Untreated 0.61 0.55 0.58 786
accuracy 0.57 8356
macro avg 0.56 0.60 0.56 8356
weighted avg 0.56 0.57 0.54 8356
Train dataset
batches [1, 2, 7, 8, 9, 10]
(45470,) (45470, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Test dataset
batches [3]
(6931,) (6931, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
FUSRevertant_Untreated: 1131
OPTN_Untreated: 1103
TBK1_Untreated: 1045
TDP43_Untreated: 930
WT_Untreated: 918
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.74 0.97 0.84 1004
FUSHomozygous_Untreated 0.94 0.57 0.71 800
FUSRevertant_Untreated 0.70 0.73 0.71 1131
OPTN_Untreated 0.48 0.40 0.43 1103
TBK1_Untreated 0.42 0.82 0.56 1045
TDP43_Untreated 0.49 0.38 0.43 930
WT_Untreated 0.57 0.17 0.26 918
accuracy 0.59 6931
macro avg 0.62 0.58 0.56 6931
weighted avg 0.61 0.59 0.57 6931
Train dataset batches [1, 2, 3, 8, 9, 10] (52238,) (52238, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 6254 FUSHomozygous_Untreated: 6054 FUSRevertant_Untreated: 5452 OPTN_Untreated: 9844 TBK1_Untreated: 6068 TDP43_Untreated: 9291 WT_Untreated: 9275 Test dataset batches [7] (163,) (163, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 26 FUSHomozygous_Untreated: 25 FUSRevertant_Untreated: 44 OPTN_Untreated: 5 TBK1_Untreated: 13 TDP43_Untreated: 13 WT_Untreated: 37 fit predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
FUSHeterozygous_Untreated 0.00 0.00 0.00 26
FUSHomozygous_Untreated 0.00 0.00 0.00 25
FUSRevertant_Untreated 0.00 0.00 0.00 44
OPTN_Untreated 0.07 0.80 0.12 5
TBK1_Untreated 0.50 0.08 0.13 13
TDP43_Untreated 0.07 0.23 0.11 13
WT_Untreated 0.61 0.97 0.75 37
accuracy 0.27 163
macro avg 0.18 0.30 0.16 163
weighted avg 0.19 0.27 0.19 163
Train dataset
batches [1, 2, 3, 7, 9, 10]
(42851,) (42851, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Test dataset
batches [8]
(9550,) (9550, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
FUSRevertant_Untreated: 1163
OPTN_Untreated: 1429
TBK1_Untreated: 755
TDP43_Untreated: 1564
WT_Untreated: 1510
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.81 0.19 0.31 1567
FUSHomozygous_Untreated 0.54 0.95 0.69 1562
FUSRevertant_Untreated 0.65 0.44 0.53 1163
OPTN_Untreated 0.43 0.24 0.31 1429
TBK1_Untreated 0.73 0.14 0.24 755
TDP43_Untreated 0.29 0.64 0.40 1564
WT_Untreated 0.43 0.36 0.39 1510
accuracy 0.45 9550
macro avg 0.55 0.42 0.41 9550
weighted avg 0.54 0.45 0.42 9550
Train dataset
batches [1, 2, 3, 7, 8, 10]
(43208,) (43208, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Test dataset
batches [9]
(9193,) (9193, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
FUSRevertant_Untreated: 1298
OPTN_Untreated: 1586
TBK1_Untreated: 984
TDP43_Untreated: 1439
WT_Untreated: 2304
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.19 0.27 0.22 963
FUSHomozygous_Untreated 0.28 0.54 0.37 619
FUSRevertant_Untreated 0.80 0.11 0.19 1298
OPTN_Untreated 0.18 0.43 0.26 1586
TBK1_Untreated 0.34 0.24 0.28 984
TDP43_Untreated 0.10 0.09 0.09 1439
WT_Untreated 0.94 0.26 0.41 2304
accuracy 0.26 9193
macro avg 0.40 0.28 0.26 9193
weighted avg 0.47 0.26 0.27 9193
Train dataset
batches [1, 2, 3, 7, 8, 9]
(45125,) (45125, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Test dataset
batches [10]
(7276,) (7276, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
FUSRevertant_Untreated: 45
OPTN_Untreated: 1763
TBK1_Untreated: 188
TDP43_Untreated: 2151
WT_Untreated: 2196
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.58 0.21 0.31 267
FUSHomozygous_Untreated 0.75 0.94 0.84 666
FUSRevertant_Untreated 0.05 0.87 0.09 45
OPTN_Untreated 0.96 0.03 0.05 1763
TBK1_Untreated 0.28 0.45 0.34 188
TDP43_Untreated 0.09 0.01 0.01 2151
WT_Untreated 0.43 0.98 0.60 2196
accuracy 0.42 7276
macro avg 0.45 0.50 0.32 7276
weighted avg 0.49 0.42 0.29 7276
0.43597069158017904 [0.5032016099524332, 0.5671373863092388, 0.5864954552012697, 0.26993865030674846, 0.4500523560209424, 0.2592189709561623, 0.4157504123144585]
## L2 norm => doesnt improve
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
for test_batches in batches:
test_batches = [test_batches]
train_batches = list(set(batches)-set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# L2 normalize each sample
X_train /= np.linalg.norm(X_train, axis=1, keepdims=True)
X_test /= np.linalg.norm(X_test, axis=1, keepdims=True)
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train), np.shape(X_train), np.unique(y_train))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test), np.shape(X_test), np.unique(y_test))
count_labels(y_test)
# Encode with same label encoder
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
# Optional: balance training set
balance = False
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_enc = ros.fit_resample(X_train, y_train_enc)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Convert to GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_enc = cudf.Series(y_train_enc)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1)
clf.fit(X_train, y_train_enc)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_enc, y_pred, target_names=le.classes_, output_dict=True)
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))
plot_confusion_matrix(y_test_enc, y_pred, le)
accuracy = report['accuracy']
accuracies.append(accuracy)
# Accumulate confusion matrix
cm = confusion_matrix(y_test_enc, y_pred, labels=np.arange(len(le.classes_)))
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
print(np.mean(accuracies), accuracies)
display_labels = [label.replace('_Untreated', '') for label in le.classes_]
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=display_labels)
disp.plot(xticks_rotation=45)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(41469,) (41469, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Test dataset
batches [1]
(10932,) (10932, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
FUSRevertant_Untreated: 1015
OPTN_Untreated: 2314
TBK1_Untreated: 1876
TDP43_Untreated: 1699
WT_Untreated: 1561
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.60 0.93 0.73 1222
FUSHomozygous_Untreated 0.86 0.39 0.54 1245
FUSRevertant_Untreated 0.77 0.88 0.82 1015
OPTN_Untreated 0.71 0.48 0.57 2314
TBK1_Untreated 0.11 0.00 0.01 1876
TDP43_Untreated 0.28 0.24 0.26 1699
WT_Untreated 0.22 0.60 0.32 1561
accuracy 0.46 10932
macro avg 0.51 0.50 0.46 10932
weighted avg 0.48 0.46 0.43 10932
Train dataset
batches [1, 3, 7, 8, 9, 10]
(44045,) (44045, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Test dataset
batches [2]
(8356,) (8356, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
FUSRevertant_Untreated: 800
OPTN_Untreated: 1649
TBK1_Untreated: 1220
TDP43_Untreated: 1508
WT_Untreated: 786
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.61 0.59 0.60 1231
FUSHomozygous_Untreated 0.60 0.57 0.58 1162
FUSRevertant_Untreated 0.44 0.98 0.60 800
OPTN_Untreated 0.54 0.68 0.60 1649
TBK1_Untreated 0.59 0.56 0.57 1220
TDP43_Untreated 0.39 0.11 0.17 1508
WT_Untreated 0.62 0.47 0.54 786
accuracy 0.54 8356
macro avg 0.54 0.57 0.52 8356
weighted avg 0.53 0.54 0.51 8356
Train dataset
batches [1, 2, 7, 8, 9, 10]
(45470,) (45470, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Test dataset
batches [3]
(6931,) (6931, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
FUSRevertant_Untreated: 1131
OPTN_Untreated: 1103
TBK1_Untreated: 1045
TDP43_Untreated: 930
WT_Untreated: 918
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.73 0.95 0.82 1004
FUSHomozygous_Untreated 0.91 0.56 0.69 800
FUSRevertant_Untreated 0.69 0.68 0.68 1131
OPTN_Untreated 0.41 0.41 0.41 1103
TBK1_Untreated 0.39 0.80 0.53 1045
TDP43_Untreated 0.49 0.34 0.40 930
WT_Untreated 0.41 0.07 0.12 918
accuracy 0.55 6931
macro avg 0.58 0.54 0.52 6931
weighted avg 0.57 0.55 0.52 6931
Train dataset batches [1, 2, 3, 8, 9, 10] (52238,) (52238, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 6254 FUSHomozygous_Untreated: 6054 FUSRevertant_Untreated: 5452 OPTN_Untreated: 9844 TBK1_Untreated: 6068 TDP43_Untreated: 9291 WT_Untreated: 9275 Test dataset batches [7] (163,) (163, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 26 FUSHomozygous_Untreated: 25 FUSRevertant_Untreated: 44 OPTN_Untreated: 5 TBK1_Untreated: 13 TDP43_Untreated: 13 WT_Untreated: 37 fit predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
FUSHeterozygous_Untreated 0.00 0.00 0.00 26
FUSHomozygous_Untreated 0.00 0.00 0.00 25
FUSRevertant_Untreated 0.00 0.00 0.00 44
OPTN_Untreated 0.06 1.00 0.11 5
TBK1_Untreated 1.00 0.08 0.14 13
TDP43_Untreated 0.05 0.15 0.08 13
WT_Untreated 0.76 0.86 0.81 37
accuracy 0.25 163
macro avg 0.27 0.30 0.16 163
weighted avg 0.26 0.25 0.21 163
Train dataset
batches [1, 2, 3, 7, 9, 10]
(42851,) (42851, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Test dataset
batches [8]
(9550,) (9550, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
FUSRevertant_Untreated: 1163
OPTN_Untreated: 1429
TBK1_Untreated: 755
TDP43_Untreated: 1564
WT_Untreated: 1510
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.81 0.13 0.22 1567
FUSHomozygous_Untreated 0.53 0.95 0.68 1562
FUSRevertant_Untreated 0.60 0.21 0.31 1163
OPTN_Untreated 0.45 0.24 0.32 1429
TBK1_Untreated 0.72 0.12 0.21 755
TDP43_Untreated 0.26 0.65 0.37 1564
WT_Untreated 0.38 0.32 0.35 1510
accuracy 0.41 9550
macro avg 0.54 0.37 0.35 9550
weighted avg 0.52 0.41 0.36 9550
Train dataset
batches [1, 2, 3, 7, 8, 10]
(43208,) (43208, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Test dataset
batches [9]
(9193,) (9193, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
FUSRevertant_Untreated: 1298
OPTN_Untreated: 1586
TBK1_Untreated: 984
TDP43_Untreated: 1439
WT_Untreated: 2304
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.13 0.16 0.14 963
FUSHomozygous_Untreated 0.29 0.63 0.40 619
FUSRevertant_Untreated 0.59 0.03 0.05 1298
OPTN_Untreated 0.18 0.50 0.26 1586
TBK1_Untreated 0.30 0.14 0.19 984
TDP43_Untreated 0.08 0.06 0.07 1439
WT_Untreated 0.88 0.20 0.32 2304
accuracy 0.22 9193
macro avg 0.35 0.24 0.21 9193
weighted avg 0.41 0.22 0.21 9193
Train dataset
batches [1, 2, 3, 7, 8, 9]
(45125,) (45125, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Test dataset
batches [10]
(7276,) (7276, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
FUSRevertant_Untreated: 45
OPTN_Untreated: 1763
TBK1_Untreated: 188
TDP43_Untreated: 2151
WT_Untreated: 2196
fit
predict
precision recall f1-score support
FUSHeterozygous_Untreated 0.60 0.34 0.43 267
FUSHomozygous_Untreated 0.78 0.91 0.84 666
FUSRevertant_Untreated 0.05 0.96 0.10 45
OPTN_Untreated 0.96 0.04 0.07 1763
TBK1_Untreated 0.30 0.33 0.31 188
TDP43_Untreated 0.06 0.00 0.01 2151
WT_Untreated 0.42 0.98 0.59 2196
accuracy 0.42 7276
macro avg 0.45 0.51 0.33 7276
weighted avg 0.48 0.42 0.30 7276
0.4052199180119422 [0.4556348335162825, 0.5391335567257061, 0.55201269658058, 0.24539877300613497, 0.40523560209424087, 0.22364842815185468, 0.41547553600879605]
all_probs = []
batches = [1,2,3,8,9,10]
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train), np.shape(X_train), np.unique(y_train))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test), np.shape(X_test), np.unique(y_test))
count_labels(y_test)
# Encode
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
class_names = le.classes_
# Optional: balance
balance = False
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_enc = ros.fit_resample(X_train, y_train_enc)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_enc = cudf.Series(y_train_enc)
probs_per_batch = {}
for class_index, class_name in enumerate(class_names):
y_binary = (y_train_enc == class_index).astype(int)
y_binary = cudf.Series(y_binary)
clf = cuMLLogisticRegression(verbose=0)
clf.fit(X_train, y_binary)
probas = clf.predict_proba(X_test).to_numpy()[:, 1]
probs_per_batch[class_name] = probas
df_probs = pd.DataFrame(probs_per_batch)
df_probs["true_label"] = [class_names[i] for i in y_test_enc]
df_probs["test_batch"] = test_batch
all_probs.append(df_probs)
results_df = pd.concat(all_probs, ignore_index=True)
Train dataset batches [2, 3, 8, 9, 10] (41306,) (41306, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5032 FUSHomozygous_Untreated: 4809 FUSRevertant_Untreated: 4437 OPTN_Untreated: 7530 TBK1_Untreated: 4192 TDP43_Untreated: 7592 WT_Untreated: 7714 Test dataset batches [1] (10932,) (10932, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 1222 FUSHomozygous_Untreated: 1245 FUSRevertant_Untreated: 1015 OPTN_Untreated: 2314 TBK1_Untreated: 1876 TDP43_Untreated: 1699 WT_Untreated: 1561 Train dataset batches [1, 3, 8, 9, 10] (43882,) (43882, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5023 FUSHomozygous_Untreated: 4892 FUSRevertant_Untreated: 4652 OPTN_Untreated: 8195 TBK1_Untreated: 4848 TDP43_Untreated: 7783 WT_Untreated: 8489 Test dataset batches [2] (8356,) (8356, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 1231 FUSHomozygous_Untreated: 1162 FUSRevertant_Untreated: 800 OPTN_Untreated: 1649 TBK1_Untreated: 1220 TDP43_Untreated: 1508 WT_Untreated: 786 Train dataset batches [1, 2, 8, 9, 10] (45307,) (45307, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5250 FUSHomozygous_Untreated: 5254 FUSRevertant_Untreated: 4321 OPTN_Untreated: 8741 TBK1_Untreated: 5023 TDP43_Untreated: 8361 WT_Untreated: 8357 Test dataset batches [3] (6931,) (6931, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 1004 FUSHomozygous_Untreated: 800 FUSRevertant_Untreated: 1131 OPTN_Untreated: 1103 TBK1_Untreated: 1045 TDP43_Untreated: 930 WT_Untreated: 918 Train dataset batches [1, 2, 3, 9, 10] (42688,) (42688, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 4687 FUSHomozygous_Untreated: 4492 FUSRevertant_Untreated: 4289 OPTN_Untreated: 8415 TBK1_Untreated: 5313 TDP43_Untreated: 7727 WT_Untreated: 7765 Test dataset batches [8] (9550,) (9550, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 1567 FUSHomozygous_Untreated: 1562 FUSRevertant_Untreated: 1163 OPTN_Untreated: 1429 TBK1_Untreated: 755 TDP43_Untreated: 1564 WT_Untreated: 1510 Train dataset batches [1, 2, 3, 8, 10] (43045,) (43045, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5291 FUSHomozygous_Untreated: 5435 FUSRevertant_Untreated: 4154 OPTN_Untreated: 8258 TBK1_Untreated: 5084 TDP43_Untreated: 7852 WT_Untreated: 6971 Test dataset batches [9] (9193,) (9193, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 963 FUSHomozygous_Untreated: 619 FUSRevertant_Untreated: 1298 OPTN_Untreated: 1586 TBK1_Untreated: 984 TDP43_Untreated: 1439 WT_Untreated: 2304 Train dataset batches [1, 2, 3, 8, 9] (44962,) (44962, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5987 FUSHomozygous_Untreated: 5388 FUSRevertant_Untreated: 5407 OPTN_Untreated: 8081 TBK1_Untreated: 5880 TDP43_Untreated: 7140 WT_Untreated: 7079 Test dataset batches [10] (7276,) (7276, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 267 FUSHomozygous_Untreated: 666 FUSRevertant_Untreated: 45 OPTN_Untreated: 1763 TBK1_Untreated: 188 TDP43_Untreated: 2151 WT_Untreated: 2196
# Get the label with max probability per row
results_df["pred_label"] = results_df[class_names].idxmax(axis=1)
cm = confusion_matrix(results_df["true_label"], results_df["pred_label"], labels=class_names)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[l.replace('_Untreated', '') for l in class_names])
disp.plot(xticks_rotation=90)
plt.title("Confusion Matrix from One-vs-Rest Predictions")
plt.tight_layout()
plt.show()
all_probs = []
batches = [1,2,3,8,9,10]
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train), np.shape(X_train), np.unique(y_train))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test), np.shape(X_test), np.unique(y_test))
count_labels(y_test)
# Encode
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
class_names = le.classes_
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
probs_per_batch = {}
X_test = cudf.DataFrame.from_records(X_test)
for class_index, class_name in enumerate(class_names):
y_binary = (y_train_enc == class_index).astype(int)
# Optional: balance
balance = True
if balance:
ros = RandomOverSampler(random_state=42)
X_traini, y_binary = ros.fit_resample(X_train, y_binary)
# To GPU
X_traini = cudf.DataFrame.from_records(X_traini)
y_binary = cudf.Series(y_binary)
clf = cuMLLogisticRegression(verbose=0)
clf.fit(X_traini, y_binary)
probas = clf.predict_proba(X_test).to_numpy()[:, 1]
probs_per_batch[class_name] = probas
df_probs = pd.DataFrame(probs_per_batch)
df_probs["true_label"] = [class_names[i] for i in y_test_enc]
df_probs["test_batch"] = test_batch
all_probs.append(df_probs)
results_df_bal = pd.concat(all_probs, ignore_index=True)
Train dataset batches [2, 3, 8, 9, 10] (41306,) (41306, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5032 FUSHomozygous_Untreated: 4809 FUSRevertant_Untreated: 4437 OPTN_Untreated: 7530 TBK1_Untreated: 4192 TDP43_Untreated: 7592 WT_Untreated: 7714 Test dataset batches [1] (10932,) (10932, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 1222 FUSHomozygous_Untreated: 1245 FUSRevertant_Untreated: 1015 OPTN_Untreated: 2314 TBK1_Untreated: 1876 TDP43_Untreated: 1699 WT_Untreated: 1561 Train dataset batches [1, 3, 8, 9, 10] (43882,) (43882, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5023 FUSHomozygous_Untreated: 4892 FUSRevertant_Untreated: 4652 OPTN_Untreated: 8195 TBK1_Untreated: 4848 TDP43_Untreated: 7783 WT_Untreated: 8489 Test dataset batches [2] (8356,) (8356, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 1231 FUSHomozygous_Untreated: 1162 FUSRevertant_Untreated: 800 OPTN_Untreated: 1649 TBK1_Untreated: 1220 TDP43_Untreated: 1508 WT_Untreated: 786 Train dataset batches [1, 2, 8, 9, 10] (45307,) (45307, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5250 FUSHomozygous_Untreated: 5254 FUSRevertant_Untreated: 4321 OPTN_Untreated: 8741 TBK1_Untreated: 5023 TDP43_Untreated: 8361 WT_Untreated: 8357 Test dataset batches [3] (6931,) (6931, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 1004 FUSHomozygous_Untreated: 800 FUSRevertant_Untreated: 1131 OPTN_Untreated: 1103 TBK1_Untreated: 1045 TDP43_Untreated: 930 WT_Untreated: 918 Train dataset batches [1, 2, 3, 9, 10] (42688,) (42688, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 4687 FUSHomozygous_Untreated: 4492 FUSRevertant_Untreated: 4289 OPTN_Untreated: 8415 TBK1_Untreated: 5313 TDP43_Untreated: 7727 WT_Untreated: 7765 Test dataset batches [8] (9550,) (9550, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 1567 FUSHomozygous_Untreated: 1562 FUSRevertant_Untreated: 1163 OPTN_Untreated: 1429 TBK1_Untreated: 755 TDP43_Untreated: 1564 WT_Untreated: 1510 Train dataset batches [1, 2, 3, 8, 10] (43045,) (43045, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5291 FUSHomozygous_Untreated: 5435 FUSRevertant_Untreated: 4154 OPTN_Untreated: 8258 TBK1_Untreated: 5084 TDP43_Untreated: 7852 WT_Untreated: 6971 Test dataset batches [9] (9193,) (9193, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 963 FUSHomozygous_Untreated: 619 FUSRevertant_Untreated: 1298 OPTN_Untreated: 1586 TBK1_Untreated: 984 TDP43_Untreated: 1439 WT_Untreated: 2304 Train dataset batches [1, 2, 3, 8, 9] (44962,) (44962, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 5987 FUSHomozygous_Untreated: 5388 FUSRevertant_Untreated: 5407 OPTN_Untreated: 8081 TBK1_Untreated: 5880 TDP43_Untreated: 7140 WT_Untreated: 7079 Test dataset batches [10] (7276,) (7276, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated' 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated' 'TDP43_Untreated' 'WT_Untreated'] FUSHeterozygous_Untreated: 267 FUSHomozygous_Untreated: 666 FUSRevertant_Untreated: 45 OPTN_Untreated: 1763 TBK1_Untreated: 188 TDP43_Untreated: 2151 WT_Untreated: 2196
# Get the label with max probability per row
results_df_bal["pred_label"] = results_df_bal[class_names].idxmax(axis=1)
cm = confusion_matrix(results_df_bal["true_label"], results_df_bal["pred_label"], labels=class_names)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[l.replace('_Untreated', '') for l in class_names])
disp.plot(xticks_rotation=90)
plt.title("Confusion Matrix from One-vs-Rest Predictions")
plt.tight_layout()
plt.show()
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = False
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.38 0.96 0.55 1561
1 0.97 0.42 0.59 4190
2 1.00 1.00 1.00 2467
accuracy 0.70 8218
macro avg 0.78 0.80 0.71 8218
weighted avg 0.87 0.70 0.71 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.88 0.60 0.71 786
1 0.89 0.98 0.93 2869
2 1.00 0.99 1.00 2393
accuracy 0.93 6048
macro avg 0.92 0.86 0.88 6048
weighted avg 0.93 0.93 0.93 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.88 0.20 0.33 918
1 0.74 0.99 0.85 2148
2 1.00 1.00 1.00 1804
accuracy 0.84 4870
macro avg 0.87 0.73 0.72 4870
weighted avg 0.86 0.84 0.81 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.50 0.97 0.66 37
1 0.53 1.00 0.69 18
2 0.00 0.00 0.00 51
accuracy 0.51 106
macro avg 0.34 0.66 0.45 106
weighted avg 0.26 0.51 0.35 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.76 0.95 0.85 1510
1 0.96 0.80 0.87 2184
2 1.00 1.00 1.00 3129
accuracy 0.92 6823
macro avg 0.91 0.92 0.91 6823
weighted avg 0.93 0.92 0.92 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 0.99 0.28 0.43 2304
1 0.45 0.52 0.48 2570
2 0.56 1.00 0.72 1582
accuracy 0.55 6456
macro avg 0.67 0.60 0.54 6456
weighted avg 0.67 0.55 0.52 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.62 1.00 0.77 2196
1 0.99 0.32 0.49 1951
2 1.00 1.00 1.00 933
accuracy 0.74 5080
macro avg 0.87 0.77 0.75 5080
weighted avg 0.83 0.74 0.70 5080
0.7426534988606892 [0.6994402531029448, 0.93369708994709, 0.8429158110882957, 0.5094339622641509, 0.9233475010992233, 0.5509603469640645, 0.7387795275590551]
## Feature selection =>not as good
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = False
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.11 0.21 0.15 1561
1 0.56 0.39 0.46 4190
2 1.00 0.99 1.00 2467
accuracy 0.54 8218
macro avg 0.56 0.53 0.53 8218
weighted avg 0.61 0.54 0.56 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.29 0.38 0.33 786
1 0.80 0.74 0.77 2869
2 1.00 0.98 0.99 2393
accuracy 0.79 6048
macro avg 0.70 0.70 0.70 6048
weighted avg 0.81 0.79 0.80 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.26 0.30 0.28 918
1 0.67 0.64 0.65 2148
2 1.00 0.99 0.99 1804
accuracy 0.70 4870
macro avg 0.64 0.64 0.64 4870
weighted avg 0.72 0.70 0.71 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.00 0.00 0.00 37
1 0.18 1.00 0.31 18
2 1.00 0.14 0.24 51
accuracy 0.24 106
macro avg 0.39 0.38 0.18 106
weighted avg 0.51 0.24 0.17 106
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.41 0.39 0.40 1510
1 0.58 0.61 0.60 2184
2 1.00 0.99 1.00 3129
accuracy 0.74 6823
macro avg 0.66 0.66 0.66 6823
weighted avg 0.74 0.74 0.74 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.00 0.00 2304
1 0.27 0.34 0.30 2570
2 0.48 1.00 0.65 1582
accuracy 0.38 6456
macro avg 0.58 0.45 0.32 6456
weighted avg 0.58 0.38 0.28 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.54 0.46 0.50 2196
1 0.48 0.57 0.52 1951
2 1.00 0.99 1.00 933
accuracy 0.60 5080
macro avg 0.67 0.67 0.67 5080
weighted avg 0.60 0.60 0.60 5080
0.5682045972728116 [0.5355317595522024, 0.7895171957671958, 0.7026694045174537, 0.2358490566037736, 0.7361864282573648, 0.37964684014869887, 0.5980314960629921]
## Feature selection + bal =>not as good
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = True
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.19 0.49 0.27 1561
1 0.52 0.21 0.30 4190
2 1.00 0.99 1.00 2467
accuracy 0.50 8218
macro avg 0.57 0.56 0.52 8218
weighted avg 0.60 0.50 0.50 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.25 0.68 0.37 786
1 0.82 0.45 0.58 2869
2 1.00 0.98 0.99 2393
accuracy 0.69 6048
macro avg 0.69 0.70 0.65 6048
weighted avg 0.82 0.69 0.71 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.26 0.57 0.36 918
1 0.61 0.30 0.41 2148
2 1.00 0.99 0.99 1804
accuracy 0.61 4870
macro avg 0.62 0.62 0.59 4870
weighted avg 0.69 0.61 0.62 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.00 0.00 0.00 37
1 0.18 1.00 0.31 18
2 1.00 0.16 0.27 51
accuracy 0.25 106
macro avg 0.39 0.39 0.19 106
weighted avg 0.51 0.25 0.18 106
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.40 0.62 0.49 1510
1 0.57 0.37 0.45 2184
2 1.00 0.99 1.00 3129
accuracy 0.71 6823
macro avg 0.66 0.66 0.64 6823
weighted avg 0.73 0.71 0.71 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 0.95 0.02 0.03 2304
1 0.27 0.33 0.30 2570
2 0.48 1.00 0.65 1582
accuracy 0.38 6456
macro avg 0.57 0.45 0.33 6456
weighted avg 0.57 0.38 0.29 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.55 0.91 0.68 2196
1 0.58 0.15 0.24 1951
2 1.00 0.99 1.00 933
accuracy 0.63 5080
macro avg 0.71 0.68 0.64 5080
weighted avg 0.64 0.63 0.57 5080
0.538174871366586 [0.4969578972986128, 0.6899801587301587, 0.609034907597536, 0.24528301886792453, 0.7103913234647515, 0.3828996282527881, 0.6326771653543307]
## Feature selection +bal+norm =>not as good
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = True
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = True
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.19 0.49 0.27 1561
1 0.53 0.22 0.31 4190
2 1.00 1.00 1.00 2467
accuracy 0.50 8218
macro avg 0.57 0.57 0.53 8218
weighted avg 0.61 0.50 0.51 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.26 0.59 0.36 786
1 0.82 0.53 0.65 2869
2 1.00 1.00 1.00 2393
accuracy 0.72 6048
macro avg 0.69 0.71 0.67 6048
weighted avg 0.82 0.72 0.75 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.28 0.54 0.37 918
1 0.67 0.40 0.50 2148
2 1.00 1.00 1.00 1804
accuracy 0.65 4870
macro avg 0.65 0.65 0.62 4870
weighted avg 0.72 0.65 0.66 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 1.00 0.03 0.05 37
1 0.30 0.33 0.32 18
2 0.48 0.80 0.60 51
accuracy 0.45 106
macro avg 0.59 0.39 0.32 106
weighted avg 0.63 0.45 0.36 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.38 0.60 0.47 1510
1 0.55 0.34 0.42 2184
2 1.00 1.00 1.00 3129
accuracy 0.70 6823
macro avg 0.64 0.64 0.63 6823
weighted avg 0.72 0.70 0.70 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.02 0.04 2304
1 0.31 0.40 0.35 2570
2 0.50 1.00 0.67 1582
accuracy 0.41 6456
macro avg 0.61 0.47 0.35 6456
weighted avg 0.60 0.41 0.32 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.55 0.88 0.68 2196
1 0.58 0.19 0.29 1951
2 1.00 1.00 1.00 933
accuracy 0.64 5080
macro avg 0.71 0.69 0.65 5080
weighted avg 0.64 0.64 0.59 5080
0.5819310102465074 [0.5046239961061085, 0.7230489417989417, 0.6486652977412731, 0.4528301886792453, 0.6980800234500952, 0.40985130111524165, 0.6364173228346457]
## No feature selection +bal =>same
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = True
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# top_features = get_top_features(X_train, y_train_mapped, 100)
# X_train = X_train[:, top_features]
# X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.37 0.97 0.54 1561
1 0.98 0.39 0.56 4190
2 1.00 1.00 1.00 2467
accuracy 0.68 8218
macro avg 0.78 0.79 0.70 8218
weighted avg 0.87 0.68 0.69 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.86 0.65 0.74 786
1 0.90 0.97 0.94 2869
2 1.00 0.99 1.00 2393
accuracy 0.94 6048
macro avg 0.92 0.87 0.89 6048
weighted avg 0.94 0.94 0.93 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.86 0.24 0.38 918
1 0.75 0.98 0.85 2148
2 1.00 1.00 1.00 1804
accuracy 0.85 4870
macro avg 0.87 0.74 0.74 4870
weighted avg 0.86 0.85 0.82 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.51 1.00 0.67 37
1 0.55 1.00 0.71 18
2 0.00 0.00 0.00 51
accuracy 0.52 106
macro avg 0.35 0.67 0.46 106
weighted avg 0.27 0.52 0.35 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.74 0.96 0.83 1510
1 0.97 0.76 0.85 2184
2 1.00 1.00 1.00 3129
accuracy 0.92 6823
macro avg 0.90 0.91 0.90 6823
weighted avg 0.93 0.92 0.92 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 0.99 0.32 0.49 2304
1 0.45 0.49 0.47 2570
2 0.55 1.00 0.71 1582
accuracy 0.56 6456
macro avg 0.66 0.60 0.55 6456
weighted avg 0.67 0.56 0.53 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.61 1.00 0.76 2196
1 0.99 0.27 0.43 1951
2 1.00 1.00 1.00 933
accuracy 0.72 5080
macro avg 0.87 0.76 0.73 5080
weighted avg 0.83 0.72 0.67 5080
0.7398777954817167 [0.6832562667315648, 0.9375, 0.848870636550308, 0.5188679245283019, 0.9152865308515316, 0.5560718711276332, 0.7192913385826771]
## add regularization c=0.1 - not improving
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = False
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1, c=0.1)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
[I] [16:58:04.002392] Unused keyword parameter: c during cuML estimator initialization
predict
precision recall f1-score support
0 0.38 0.96 0.55 1561
1 0.97 0.42 0.59 4190
2 1.00 1.00 1.00 2467
accuracy 0.70 8218
macro avg 0.78 0.80 0.71 8218
weighted avg 0.87 0.70 0.71 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.88 0.60 0.71 786
1 0.89 0.98 0.93 2869
2 1.00 0.99 1.00 2393
accuracy 0.93 6048
macro avg 0.92 0.86 0.88 6048
weighted avg 0.93 0.93 0.93 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.87 0.20 0.32 918
1 0.74 0.99 0.85 2148
2 1.00 1.00 1.00 1804
accuracy 0.84 4870
macro avg 0.87 0.73 0.72 4870
weighted avg 0.86 0.84 0.80 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.50 0.97 0.66 37
1 0.53 1.00 0.69 18
2 0.00 0.00 0.00 51
accuracy 0.51 106
macro avg 0.34 0.66 0.45 106
weighted avg 0.26 0.51 0.35 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.76 0.95 0.85 1510
1 0.96 0.80 0.87 2184
2 1.00 1.00 1.00 3129
accuracy 0.92 6823
macro avg 0.91 0.91 0.90 6823
weighted avg 0.93 0.92 0.92 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 0.99 0.28 0.43 2304
1 0.45 0.52 0.48 2570
2 0.56 1.00 0.72 1582
accuracy 0.55 6456
macro avg 0.67 0.60 0.54 6456
weighted avg 0.67 0.55 0.52 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.62 1.00 0.77 2196
1 0.99 0.32 0.49 1951
2 1.00 1.00 1.00 933
accuracy 0.74 5080
macro avg 0.87 0.77 0.75 5080
weighted avg 0.84 0.74 0.70 5080
0.7425284397307202 [0.6993185689948893, 0.9333664021164021, 0.8425051334702258, 0.5094339622641509, 0.923054374908398, 0.5506505576208178, 0.7393700787401575]
## add regularization c=10
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = False
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = cuMLLogisticRegression(verbose=1, c=10)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.38 0.96 0.55 1561
1 0.97 0.42 0.59 4190
2 1.00 1.00 1.00 2467
accuracy 0.70 8218
macro avg 0.78 0.80 0.71 8218
weighted avg 0.87 0.70 0.70 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.88 0.60 0.71 786
1 0.89 0.98 0.93 2869
2 1.00 0.99 1.00 2393
accuracy 0.93 6048
macro avg 0.92 0.86 0.88 6048
weighted avg 0.93 0.93 0.93 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.87 0.20 0.33 918
1 0.74 0.99 0.85 2148
2 1.00 1.00 1.00 1804
accuracy 0.84 4870
macro avg 0.87 0.73 0.72 4870
weighted avg 0.86 0.84 0.80 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.50 0.97 0.66 37
1 0.53 1.00 0.69 18
2 0.00 0.00 0.00 51
accuracy 0.51 106
macro avg 0.34 0.66 0.45 106
weighted avg 0.26 0.51 0.35 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.76 0.95 0.85 1510
1 0.96 0.80 0.87 2184
2 1.00 1.00 1.00 3129
accuracy 0.92 6823
macro avg 0.91 0.91 0.90 6823
weighted avg 0.93 0.92 0.92 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 0.99 0.28 0.43 2304
1 0.45 0.52 0.48 2570
2 0.56 1.00 0.72 1582
accuracy 0.55 6456
macro avg 0.66 0.60 0.54 6456
weighted avg 0.67 0.55 0.52 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.62 1.00 0.77 2196
1 0.99 0.32 0.49 1951
2 1.00 1.00 1.00 933
accuracy 0.74 5080
macro avg 0.87 0.77 0.75 5080
weighted avg 0.83 0.74 0.70 5080
0.7422607518311682 [0.6983450961304454, 0.9340277777777778, 0.8427104722792608, 0.5094339622641509, 0.9232009380038106, 0.5497211895910781, 0.7383858267716535]
## RandomForestClassifier
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = True
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = RandomForestClassifier(n_estimators=100, max_depth=20)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.44 0.32 0.37 1561
1 0.77 0.85 0.81 4190
2 1.00 1.00 1.00 2467
accuracy 0.79 8218
macro avg 0.74 0.72 0.73 8218
weighted avg 0.78 0.79 0.78 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.88 0.16 0.28 786
1 0.80 0.99 0.89 2869
2 1.00 0.98 0.99 2393
accuracy 0.88 6048
macro avg 0.89 0.71 0.72 6048
weighted avg 0.89 0.88 0.85 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.68 0.05 0.09 918
1 0.71 0.99 0.83 2148
2 1.00 1.00 1.00 1804
accuracy 0.82 4870
macro avg 0.79 0.68 0.64 4870
weighted avg 0.81 0.82 0.75 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.75 0.16 0.27 37
1 0.23 1.00 0.37 18
2 0.58 0.22 0.31 51
accuracy 0.33 106
macro avg 0.52 0.46 0.32 106
weighted avg 0.58 0.33 0.31 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.72 0.60 0.66 1510
1 0.74 0.84 0.79 2184
2 1.00 0.99 0.99 3129
accuracy 0.86 6823
macro avg 0.82 0.81 0.81 6823
weighted avg 0.86 0.86 0.85 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 0.99 0.07 0.14 2304
1 0.25 0.28 0.27 2570
2 0.46 1.00 0.63 1582
accuracy 0.38 6456
macro avg 0.57 0.45 0.34 6456
weighted avg 0.57 0.38 0.31 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.60 1.00 0.75 2196
1 0.97 0.26 0.41 1951
2 1.00 0.99 1.00 933
accuracy 0.71 5080
macro avg 0.86 0.75 0.72 5080
weighted avg 0.82 0.71 0.67 5080
0.6819123024071383 [0.7937454368459479, 0.8814484126984127, 0.8151950718685832, 0.330188679245283, 0.8563681664956764, 0.3830545229244114, 0.7133858267716535]
## RandomForestClassifier
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
balance = False
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
norm = False
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = RandomForestClassifier(n_estimators=100, max_depth=20)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.46 0.21 0.28 1561
1 0.75 0.91 0.82 4190
2 1.00 1.00 1.00 2467
accuracy 0.80 8218
macro avg 0.74 0.70 0.70 8218
weighted avg 0.77 0.80 0.77 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.92 0.11 0.20 786
1 0.79 1.00 0.88 2869
2 1.00 0.98 0.99 2393
accuracy 0.88 6048
macro avg 0.90 0.70 0.69 6048
weighted avg 0.89 0.88 0.84 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.65 0.02 0.04 918
1 0.70 0.99 0.82 2148
2 1.00 1.00 1.00 1804
accuracy 0.81 4870
macro avg 0.78 0.67 0.62 4870
weighted avg 0.80 0.81 0.74 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.80 0.11 0.19 37
1 0.28 0.94 0.44 18
2 0.54 0.43 0.48 51
accuracy 0.41 106
macro avg 0.54 0.49 0.37 106
weighted avg 0.59 0.41 0.37 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.78 0.49 0.60 1510
1 0.71 0.91 0.80 2184
2 1.00 0.99 0.99 3129
accuracy 0.85 6823
macro avg 0.83 0.79 0.80 6823
weighted avg 0.86 0.85 0.84 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.07 0.14 2304
1 0.26 0.30 0.28 2570
2 0.47 1.00 0.64 1582
accuracy 0.39 6456
macro avg 0.58 0.46 0.35 6456
weighted avg 0.58 0.39 0.32 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.63 0.99 0.77 2196
1 0.96 0.35 0.51 1951
2 1.00 0.99 0.99 933
accuracy 0.74 5080
macro avg 0.86 0.78 0.76 5080
weighted avg 0.83 0.74 0.71 5080
0.6969770713683098 [0.8015332197614992, 0.875165343915344, 0.8123203285420945, 0.4056603773584906, 0.8505056426791734, 0.3895600991325898, 0.7440944881889764]
## RandomForestClassifier
balance = False
norm = False
choose_features=True
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
if choose_features:
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = RandomForestClassifier(n_estimators=100, max_depth=20)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.08 0.15 0.11 1561
1 0.54 0.38 0.45 4190
2 1.00 0.99 1.00 2467
accuracy 0.52 8218
macro avg 0.54 0.51 0.52 8218
weighted avg 0.59 0.52 0.55 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.26 0.36 0.30 786
1 0.79 0.72 0.75 2869
2 1.00 0.99 0.99 2393
accuracy 0.78 6048
macro avg 0.68 0.69 0.68 6048
weighted avg 0.81 0.78 0.79 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.26 0.30 0.28 918
1 0.67 0.63 0.65 2148
2 1.00 0.99 1.00 1804
accuracy 0.70 4870
macro avg 0.64 0.64 0.64 4870
weighted avg 0.72 0.70 0.71 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.00 0.00 0.00 37
1 0.20 0.61 0.30 18
2 0.56 0.55 0.55 51
accuracy 0.37 106
macro avg 0.25 0.39 0.28 106
weighted avg 0.30 0.37 0.32 106
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.37 0.36 0.36 1510
1 0.56 0.57 0.56 2184
2 1.00 0.99 1.00 3129
accuracy 0.72 6823
macro avg 0.64 0.64 0.64 6823
weighted avg 0.72 0.72 0.72 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.00 0.00 2304
1 0.25 0.29 0.27 2570
2 0.47 1.00 0.64 1582
accuracy 0.36 6456
macro avg 0.57 0.43 0.30 6456
weighted avg 0.57 0.36 0.26 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.51 0.45 0.48 2196
1 0.45 0.51 0.48 1951
2 1.00 0.99 1.00 933
accuracy 0.57 5080
macro avg 0.65 0.65 0.65 5080
weighted avg 0.58 0.57 0.57 5080
0.5743842090556159 [0.520929666585544, 0.7782738095238095, 0.7012320328542094, 0.36792452830188677, 0.7162538472812546, 0.362453531598513, 0.5736220472440945]
## RandomForestClassifier
balance = False
norm = False
choose_features=False
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
if choose_features:
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = RandomForestClassifier(n_estimators=50, max_depth=20)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.43 0.22 0.29 1561
1 0.75 0.89 0.81 4190
2 1.00 1.00 1.00 2467
accuracy 0.79 8218
macro avg 0.73 0.70 0.70 8218
weighted avg 0.77 0.79 0.77 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.92 0.13 0.23 786
1 0.80 1.00 0.88 2869
2 1.00 0.98 0.99 2393
accuracy 0.88 6048
macro avg 0.90 0.70 0.70 6048
weighted avg 0.89 0.88 0.84 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.70 0.04 0.08 918
1 0.71 0.99 0.83 2148
2 1.00 1.00 1.00 1804
accuracy 0.81 4870
macro avg 0.80 0.68 0.63 4870
weighted avg 0.81 0.81 0.75 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.33 0.03 0.05 37
1 0.23 0.78 0.35 18
2 0.41 0.33 0.37 51
accuracy 0.30 106
macro avg 0.32 0.38 0.26 106
weighted avg 0.35 0.30 0.25 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.72 0.50 0.59 1510
1 0.70 0.87 0.78 2184
2 1.00 0.99 0.99 3129
accuracy 0.84 6823
macro avg 0.81 0.78 0.79 6823
weighted avg 0.84 0.84 0.84 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.10 0.18 2304
1 0.27 0.30 0.28 2570
2 0.46 1.00 0.63 1582
accuracy 0.40 6456
macro avg 0.58 0.46 0.36 6456
weighted avg 0.58 0.40 0.33 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.63 0.99 0.77 2196
1 0.95 0.35 0.51 1951
2 1.00 0.99 1.00 933
accuracy 0.74 5080
macro avg 0.86 0.78 0.76 5080
weighted avg 0.82 0.74 0.71 5080
0.6812646448819758 [0.7942321732781699, 0.8771494708994709, 0.8149897330595482, 0.3018867924528302, 0.8406859152865308, 0.39699504337050806, 0.7429133858267717]
## RandomForestClassifier n_estimators=50
balance = False
norm = False
choose_features=False
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
if choose_features:
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = RandomForestClassifier(n_estimators=50, max_depth=20)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.43 0.22 0.29 1561
1 0.75 0.89 0.81 4190
2 1.00 1.00 1.00 2467
accuracy 0.79 8218
macro avg 0.73 0.70 0.70 8218
weighted avg 0.77 0.79 0.77 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.92 0.13 0.23 786
1 0.80 1.00 0.88 2869
2 1.00 0.98 0.99 2393
accuracy 0.88 6048
macro avg 0.90 0.70 0.70 6048
weighted avg 0.89 0.88 0.84 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.70 0.04 0.08 918
1 0.71 0.99 0.83 2148
2 1.00 1.00 1.00 1804
accuracy 0.81 4870
macro avg 0.80 0.68 0.63 4870
weighted avg 0.81 0.81 0.75 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.33 0.03 0.05 37
1 0.23 0.78 0.35 18
2 0.41 0.33 0.37 51
accuracy 0.30 106
macro avg 0.32 0.38 0.26 106
weighted avg 0.35 0.30 0.25 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.72 0.50 0.59 1510
1 0.70 0.87 0.78 2184
2 1.00 0.99 0.99 3129
accuracy 0.84 6823
macro avg 0.81 0.78 0.79 6823
weighted avg 0.84 0.84 0.84 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.10 0.18 2304
1 0.27 0.30 0.28 2570
2 0.46 1.00 0.63 1582
accuracy 0.40 6456
macro avg 0.58 0.46 0.36 6456
weighted avg 0.58 0.40 0.33 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.63 0.99 0.77 2196
1 0.95 0.35 0.51 1951
2 1.00 0.99 1.00 933
accuracy 0.74 5080
macro avg 0.86 0.78 0.76 5080
weighted avg 0.82 0.74 0.71 5080
0.6812646448819758 [0.7942321732781699, 0.8771494708994709, 0.8149897330595482, 0.3018867924528302, 0.8406859152865308, 0.39699504337050806, 0.7429133858267717]
## RandomForestClassifier n_estimators=200
balance = False
norm = False
choose_features=False
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
if choose_features:
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = RandomForestClassifier(n_estimators=200, max_depth=20)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.46 0.19 0.27 1561
1 0.75 0.92 0.83 4190
2 1.00 1.00 1.00 2467
accuracy 0.80 8218
macro avg 0.74 0.70 0.70 8218
weighted avg 0.77 0.80 0.77 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.95 0.10 0.18 786
1 0.79 1.00 0.88 2869
2 1.00 0.98 0.99 2393
accuracy 0.87 6048
macro avg 0.91 0.69 0.69 6048
weighted avg 0.89 0.87 0.83 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.82 0.02 0.04 918
1 0.70 1.00 0.83 2148
2 1.00 1.00 1.00 1804
accuracy 0.81 4870
macro avg 0.84 0.67 0.62 4870
weighted avg 0.83 0.81 0.74 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.67 0.05 0.10 37
1 0.26 0.89 0.40 18
2 0.56 0.45 0.50 51
accuracy 0.39 106
macro avg 0.50 0.46 0.33 106
weighted avg 0.55 0.39 0.34 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.82 0.48 0.60 1510
1 0.71 0.93 0.80 2184
2 1.00 0.99 0.99 3129
accuracy 0.86 6823
macro avg 0.84 0.80 0.80 6823
weighted avg 0.87 0.86 0.85 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.06 0.12 2304
1 0.26 0.29 0.28 2570
2 0.47 1.00 0.63 1582
accuracy 0.38 6456
macro avg 0.57 0.45 0.34 6456
weighted avg 0.57 0.38 0.31 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.63 1.00 0.77 2196
1 0.98 0.33 0.50 1951
2 1.00 0.99 1.00 933
accuracy 0.74 5080
macro avg 0.87 0.77 0.75 5080
weighted avg 0.83 0.74 0.71 5080
0.6941267651147333 [0.802750060842054, 0.874834656084656, 0.8135523613963039, 0.3867924528301887, 0.8556353510186135, 0.3839838909541512, 0.7413385826771653]
## RandomForestClassifier max_depth=10
balance = False
norm = False
choose_features=False
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
if choose_features:
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = RandomForestClassifier(n_estimators=100, max_depth=10)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.44 0.14 0.22 1561
1 0.74 0.93 0.83 4190
2 1.00 1.00 1.00 2467
accuracy 0.80 8218
macro avg 0.73 0.69 0.68 8218
weighted avg 0.76 0.80 0.76 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 1.00 0.04 0.07 786
1 0.78 1.00 0.88 2869
2 1.00 0.98 0.99 2393
accuracy 0.86 6048
macro avg 0.93 0.67 0.64 6048
weighted avg 0.89 0.86 0.82 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.67 0.00 0.01 918
1 0.70 1.00 0.82 2148
2 1.00 1.00 1.00 1804
accuracy 0.81 4870
macro avg 0.79 0.67 0.61 4870
weighted avg 0.80 0.81 0.73 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.83 0.14 0.23 37
1 0.25 0.94 0.40 18
2 0.55 0.35 0.43 51
accuracy 0.38 106
macro avg 0.54 0.48 0.35 106
weighted avg 0.60 0.38 0.36 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.80 0.44 0.57 1510
1 0.70 0.93 0.80 2184
2 1.00 0.99 0.99 3129
accuracy 0.85 6823
macro avg 0.83 0.79 0.79 6823
weighted avg 0.86 0.85 0.84 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.05 0.10 2304
1 0.26 0.30 0.28 2570
2 0.47 1.00 0.64 1582
accuracy 0.38 6456
macro avg 0.58 0.45 0.34 6456
weighted avg 0.58 0.38 0.30 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.64 0.99 0.78 2196
1 0.96 0.37 0.53 1951
2 1.00 0.99 1.00 933
accuracy 0.75 5080
macro avg 0.87 0.78 0.77 5080
weighted avg 0.83 0.75 0.72 5080
0.6911670464646049 [0.8016549038695546, 0.8649140211640212, 0.8108829568788501, 0.37735849056603776, 0.8475743807709218, 0.38460346964064435, 0.7511811023622047]
## RandomForestClassifier max_depth=50
balance = False
norm = False
choose_features=False
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None
# Label mapping
label_map = {
'WT_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}
for test_batch in batches:
test_batches = [test_batch]
train_batches = list(set(batches) - set(test_batches))
X_train, y_train = load_batches(train_batches)
X_test, y_test = load_batches(test_batches)
# Filter out unwanted labels
train_mask = ~np.isin(y_train, list(excluded_labels))
test_mask = ~np.isin(y_test, list(excluded_labels))
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]
# Map labels to 0,1,2
y_train_mapped = np.array([label_map[l] for l in y_train])
y_test_mapped = np.array([label_map[l] for l in y_test])
print('Train dataset')
print('batches', train_batches)
print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
count_labels(y_train)
print('Test dataset')
print('batches', test_batches)
print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
count_labels(y_test)
# Optional: balance
if balance:
ros = RandomOverSampler(random_state=42)
X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)
# Optional: normalize
if norm:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
if choose_features:
top_features = get_top_features(X_train, y_train_mapped, 100)
X_train = X_train[:, top_features]
X_test = X_test[:, top_features]
# To GPU
X_train = cudf.DataFrame.from_records(X_train)
X_test = cudf.DataFrame.from_records(X_test)
y_train_mapped = cudf.Series(y_train_mapped)
# Train
print('fit')
clf = RandomForestClassifier(n_estimators=100, max_depth=50)
clf.fit(X_train, y_train_mapped)
# Predict
print('predict')
y_pred = clf.predict(X_test).to_numpy()
report = classification_report(y_test_mapped, y_pred, output_dict=True)
print(classification_report(y_test_mapped, y_pred))
accuracy = report['accuracy']
accuracies.append(accuracy)
# Confusion matrix
cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
if accumulated_cm is None:
accumulated_cm = cm
else:
accumulated_cm += cm
# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
precision recall f1-score support
0 0.46 0.22 0.30 1561
1 0.76 0.90 0.82 4190
2 1.00 1.00 1.00 2467
accuracy 0.80 8218
macro avg 0.74 0.71 0.71 8218
weighted avg 0.77 0.80 0.78 8218
Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
precision recall f1-score support
0 0.91 0.12 0.21 786
1 0.79 1.00 0.88 2869
2 1.00 0.98 0.99 2393
accuracy 0.88 6048
macro avg 0.90 0.70 0.70 6048
weighted avg 0.89 0.88 0.84 6048
Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
precision recall f1-score support
0 0.70 0.03 0.05 918
1 0.70 0.99 0.82 2148
2 1.00 1.00 1.00 1804
accuracy 0.81 4870
macro avg 0.80 0.67 0.62 4870
weighted avg 0.81 0.81 0.74 4870
Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
precision recall f1-score support
0 0.80 0.11 0.19 37
1 0.25 0.89 0.40 18
2 0.55 0.41 0.47 51
accuracy 0.39 106
macro avg 0.54 0.47 0.35 106
weighted avg 0.59 0.39 0.36 106
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
precision recall f1-score support
0 0.76 0.51 0.61 1510
1 0.71 0.89 0.79 2184
2 1.00 0.99 0.99 3129
accuracy 0.85 6823
macro avg 0.83 0.79 0.80 6823
weighted avg 0.86 0.85 0.84 6823
Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
precision recall f1-score support
0 1.00 0.08 0.15 2304
1 0.27 0.30 0.28 2570
2 0.47 1.00 0.64 1582
accuracy 0.39 6456
macro avg 0.58 0.46 0.36 6456
weighted avg 0.58 0.39 0.32 6456
Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
precision recall f1-score support
0 0.62 0.99 0.77 2196
1 0.97 0.33 0.49 1951
2 1.00 0.99 0.99 933
accuracy 0.74 5080
macro avg 0.86 0.77 0.75 5080
weighted avg 0.82 0.74 0.70 5080
0.6940960873564158 [0.8012898515453881, 0.8763227513227513, 0.813141683778234, 0.3867924528301887, 0.8500659533929357, 0.3940520446096654, 0.7370078740157481]
## Baseline
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.66 0.92 0.77 1222
1 0.87 0.53 0.66 1245
2 0.81 0.91 0.86 1015
3 0.79 0.51 0.62 2314
4 0.12 0.00 0.00 1876
5 0.28 0.31 0.29 1699
6 0.28 0.71 0.40 1561
accuracy 0.50 10932
macro avg 0.54 0.55 0.51 10932
weighted avg 0.52 0.50 0.47 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.63 0.64 0.63 1231
1 0.61 0.58 0.60 1162
2 0.47 0.99 0.64 800
3 0.57 0.65 0.61 1649
4 0.64 0.61 0.62 1220
5 0.38 0.15 0.21 1508
6 0.61 0.55 0.58 786
accuracy 0.57 8356
macro avg 0.56 0.60 0.56 8356
weighted avg 0.56 0.57 0.54 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.74 0.97 0.84 1004
1 0.94 0.57 0.71 800
2 0.70 0.74 0.72 1131
3 0.47 0.40 0.43 1103
4 0.42 0.82 0.56 1045
5 0.49 0.38 0.43 930
6 0.57 0.17 0.26 918
accuracy 0.59 6931
macro avg 0.62 0.58 0.56 6931
weighted avg 0.61 0.59 0.57 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.00 0.00 0.00 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.06 0.80 0.12 5
4 0.50 0.08 0.13 13
5 0.07 0.23 0.11 13
6 0.61 0.97 0.75 37
accuracy 0.27 163
macro avg 0.18 0.30 0.16 163
weighted avg 0.19 0.27 0.19 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.81 0.19 0.31 1567
1 0.54 0.95 0.69 1562
2 0.65 0.44 0.53 1163
3 0.43 0.24 0.31 1429
4 0.74 0.14 0.23 755
5 0.29 0.64 0.40 1564
6 0.43 0.36 0.39 1510
accuracy 0.45 9550
macro avg 0.56 0.42 0.41 9550
weighted avg 0.54 0.45 0.42 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.19 0.27 0.22 963
1 0.28 0.54 0.37 619
2 0.78 0.11 0.19 1298
3 0.18 0.43 0.26 1586
4 0.34 0.23 0.28 984
5 0.10 0.09 0.09 1439
6 0.94 0.26 0.41 2304
accuracy 0.26 9193
macro avg 0.40 0.28 0.26 9193
weighted avg 0.47 0.26 0.27 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.58 0.21 0.31 267
1 0.75 0.94 0.84 666
2 0.04 0.87 0.08 45
3 0.96 0.03 0.05 1763
4 0.28 0.45 0.35 188
5 0.09 0.01 0.01 2151
6 0.43 0.98 0.60 2196
accuracy 0.42 7276
macro avg 0.45 0.50 0.32 7276
weighted avg 0.49 0.42 0.30 7276
=== Overall Accuracy ===
0.435781782307196 [0.5038419319429198, 0.5652225945428435, 0.5863511758764969, 0.26993865030674846, 0.4496335078534031, 0.25987164146633307, 0.4156129741616273]
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuRandomForestClassifier,
classifier_kwargs={"n_estimators": 100, "max_depth": 20},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.53 0.83 0.64 1222
1 0.62 0.27 0.37 1245
2 0.83 0.18 0.30 1015
3 0.37 0.72 0.49 2314
4 0.07 0.00 0.00 1876
5 0.29 0.35 0.32 1699
6 0.11 0.12 0.11 1561
accuracy 0.37 10932
macro avg 0.40 0.35 0.32 10932
weighted avg 0.36 0.37 0.31 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.54 0.86 0.66 1231
1 0.61 0.21 0.31 1162
2 0.31 0.93 0.47 800
3 0.43 0.56 0.49 1649
4 0.48 0.15 0.23 1220
5 0.30 0.10 0.15 1508
6 0.47 0.33 0.39 786
accuracy 0.43 8356
macro avg 0.45 0.45 0.39 8356
weighted avg 0.45 0.43 0.38 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.60 0.96 0.73 1004
1 0.79 0.19 0.30 800
2 0.73 0.18 0.29 1131
3 0.19 0.15 0.17 1103
4 0.38 0.72 0.49 1045
5 0.30 0.40 0.34 930
6 0.19 0.15 0.17 918
accuracy 0.40 6931
macro avg 0.45 0.39 0.36 6931
weighted avg 0.45 0.40 0.36 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.15 0.23 0.18 26
1 0.31 0.16 0.21 25
2 0.00 0.00 0.00 44
3 0.04 0.80 0.08 5
4 0.00 0.00 0.00 13
5 0.00 0.00 0.00 13
6 0.67 0.16 0.26 37
accuracy 0.12 163
macro avg 0.17 0.19 0.10 163
weighted avg 0.22 0.12 0.12 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.91 0.07 0.14 1567
1 0.52 0.98 0.68 1562
2 0.17 0.08 0.11 1163
3 0.24 0.21 0.22 1429
4 0.32 0.01 0.02 755
5 0.20 0.39 0.27 1564
6 0.22 0.24 0.23 1510
accuracy 0.32 9550
macro avg 0.37 0.28 0.24 9550
weighted avg 0.38 0.32 0.26 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.15 0.20 0.17 963
1 0.21 0.86 0.34 619
2 0.00 0.00 0.00 1298
3 0.00 0.00 0.00 1586
4 0.02 0.00 0.00 984
5 0.04 0.04 0.04 1439
6 0.69 0.13 0.22 2304
accuracy 0.12 9193
macro avg 0.16 0.18 0.11 9193
weighted avg 0.21 0.12 0.10 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.23 0.11 0.15 267
1 0.71 0.85 0.78 666
2 0.04 0.67 0.08 45
3 0.57 0.01 0.02 1763
4 0.21 0.02 0.04 188
5 0.12 0.01 0.02 2151
6 0.39 0.96 0.55 2196
accuracy 0.38 7276
macro avg 0.33 0.38 0.24 7276
weighted avg 0.37 0.38 0.26 7276
=== Overall Accuracy ===
0.30419986925510606 [0.36525795828759605, 0.4269985639061752, 0.3970567017746357, 0.12269938650306748, 0.3169633507853403, 0.11889481126944414, 0.38152831225948325]
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuRandomForestClassifier,
classifier_kwargs={"n_estimators": 100, "max_depth": 20},
test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.53 0.83 0.64 1222
1 0.62 0.27 0.37 1245
2 0.83 0.18 0.30 1015
3 0.37 0.72 0.49 2314
4 0.07 0.00 0.00 1876
5 0.29 0.35 0.32 1699
6 0.11 0.12 0.11 1561
accuracy 0.37 10932
macro avg 0.40 0.35 0.32 10932
weighted avg 0.36 0.37 0.31 10932
=== Overall Accuracy ===
0.36525795828759605 [0.36525795828759605]
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuRandomForestClassifier,
classifier_kwargs={"n_estimators": 50, "max_depth": 20},
test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.52 0.76 0.62 1222
1 0.57 0.30 0.40 1245
2 0.68 0.20 0.31 1015
3 0.38 0.70 0.49 2314
4 0.08 0.00 0.01 1876
5 0.28 0.33 0.30 1699
6 0.14 0.16 0.15 1561
accuracy 0.36 10932
macro avg 0.38 0.35 0.32 10932
weighted avg 0.34 0.36 0.32 10932
=== Overall Accuracy ===
0.35949506037321627 [0.35949506037321627]
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuRandomForestClassifier,
classifier_kwargs={"n_estimators": 200, "max_depth": 20},
test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.54 0.86 0.66 1222
1 0.66 0.26 0.38 1245
2 0.90 0.19 0.32 1015
3 0.38 0.75 0.51 2314
4 0.10 0.00 0.00 1876
5 0.30 0.37 0.33 1699
6 0.09 0.10 0.10 1561
accuracy 0.37 10932
macro avg 0.42 0.36 0.33 10932
weighted avg 0.38 0.37 0.32 10932
=== Overall Accuracy ===
0.3739480424442005 [0.3739480424442005]
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=True,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuRandomForestClassifier,
classifier_kwargs={"n_estimators": 50, "max_depth": 20},
test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.53 0.79 0.63 1222
1 0.60 0.30 0.40 1245
2 0.58 0.32 0.42 1015
3 0.37 0.67 0.48 2314
4 0.16 0.01 0.03 1876
5 0.30 0.33 0.31 1699
6 0.13 0.13 0.13 1561
accuracy 0.37 10932
macro avg 0.38 0.37 0.34 10932
weighted avg 0.35 0.37 0.33 10932
=== Overall Accuracy ===
0.36717892425905596 [0.36717892425905596]
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuRandomForestClassifier,
classifier_kwargs={"n_estimators": 30, "max_depth": 20},
test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.51 0.71 0.60 1222
1 0.55 0.33 0.41 1245
2 0.49 0.20 0.28 1015
3 0.36 0.66 0.47 2314
4 0.11 0.01 0.01 1876
5 0.27 0.29 0.28 1699
6 0.15 0.18 0.16 1561
accuracy 0.35 10932
macro avg 0.35 0.34 0.32 10932
weighted avg 0.33 0.35 0.31 10932
=== Overall Accuracy ===
0.34824368825466523 [0.34824368825466523]
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuRandomForestClassifier,
classifier_kwargs={"n_estimators": 20, "max_depth": 20},
test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.51 0.70 0.59 1222
1 0.54 0.35 0.42 1245
2 0.43 0.22 0.29 1015
3 0.36 0.60 0.45 2314
4 0.15 0.02 0.03 1876
5 0.27 0.29 0.28 1699
6 0.16 0.22 0.19 1561
accuracy 0.34 10932
macro avg 0.35 0.34 0.32 10932
weighted avg 0.33 0.34 0.31 10932
=== Overall Accuracy ===
0.34357848518111966 [0.34357848518111966]
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuRandomForestClassifier,
classifier_kwargs={"n_estimators": 20, "max_depth": 15},
# test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.52 0.70 0.59 1222
1 0.55 0.35 0.43 1245
2 0.50 0.19 0.28 1015
3 0.35 0.61 0.45 2314
4 0.13 0.01 0.02 1876
5 0.28 0.30 0.29 1699
6 0.14 0.19 0.16 1561
accuracy 0.34 10932
macro avg 0.35 0.34 0.32 10932
weighted avg 0.33 0.34 0.31 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.53 0.73 0.62 1231
1 0.53 0.31 0.39 1162
2 0.32 0.86 0.46 800
3 0.42 0.49 0.45 1649
4 0.38 0.18 0.24 1220
5 0.35 0.14 0.20 1508
6 0.37 0.34 0.36 786
accuracy 0.41 8356
macro avg 0.42 0.44 0.39 8356
weighted avg 0.42 0.41 0.38 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.60 0.90 0.72 1004
1 0.69 0.27 0.38 800
2 0.48 0.19 0.27 1131
3 0.19 0.16 0.17 1103
4 0.34 0.55 0.42 1045
5 0.27 0.38 0.32 930
6 0.20 0.17 0.18 918
accuracy 0.37 6931
macro avg 0.40 0.37 0.35 6931
weighted avg 0.39 0.37 0.35 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.12 0.23 0.16 26
1 0.15 0.24 0.18 25
2 0.00 0.00 0.00 44
3 0.03 0.40 0.06 5
4 0.00 0.00 0.00 13
5 0.00 0.00 0.00 13
6 0.50 0.11 0.18 37
accuracy 0.11 163
macro avg 0.12 0.14 0.08 163
weighted avg 0.16 0.11 0.10 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.70 0.11 0.19 1567
1 0.52 0.93 0.66 1562
2 0.19 0.09 0.13 1163
3 0.23 0.22 0.23 1429
4 0.22 0.03 0.05 755
5 0.22 0.38 0.28 1564
6 0.25 0.29 0.26 1510
accuracy 0.33 9550
macro avg 0.33 0.29 0.26 9550
weighted avg 0.35 0.33 0.28 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.19 0.26 0.22 963
1 0.20 0.81 0.32 619
2 0.29 0.00 0.01 1298
3 0.01 0.02 0.01 1586
4 0.11 0.04 0.06 984
5 0.12 0.13 0.12 1439
6 0.62 0.19 0.29 2304
accuracy 0.16 9193
macro avg 0.22 0.21 0.15 9193
weighted avg 0.26 0.16 0.15 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.32 0.22 0.26 267
1 0.73 0.81 0.77 666
2 0.03 0.56 0.06 45
3 0.55 0.03 0.05 1763
4 0.11 0.05 0.07 188
5 0.17 0.04 0.06 2151
6 0.39 0.87 0.54 2196
accuracy 0.37 7276
macro avg 0.33 0.37 0.26 7276
weighted avg 0.38 0.37 0.27 7276
=== Overall Accuracy ===
0.2980805825298372 [0.3403768752286864, 0.41359502154140737, 0.37209637858894823, 0.11042944785276074, 0.3255497382198953, 0.15838137713477646, 0.36613523914238594]
from cuml.svm import SVC as cuSVC
run_baseline_model(
classifier_class=cuSVC,
classifier_kwargs={"kernel": "linear", "C": 1.0, "gamma": "scale"},
norm=True, # important for SVMs
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.64 0.89 0.74 1222
1 0.83 0.50 0.63 1245
2 0.83 0.91 0.86 1015
3 0.78 0.54 0.64 2314
4 0.03 0.00 0.00 1876
5 0.26 0.35 0.30 1699
6 0.31 0.69 0.43 1561
accuracy 0.51 10932
macro avg 0.52 0.55 0.52 10932
weighted avg 0.50 0.51 0.48 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.59 0.59 0.59 1231
1 0.57 0.55 0.56 1162
2 0.56 0.99 0.72 800
3 0.49 0.70 0.57 1649
4 0.59 0.62 0.61 1220
5 0.30 0.11 0.16 1508
6 0.78 0.42 0.55 786
accuracy 0.55 8356
macro avg 0.55 0.57 0.54 8356
weighted avg 0.53 0.55 0.52 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.73 0.95 0.83 1004
1 0.91 0.55 0.69 800
2 0.72 0.67 0.70 1131
3 0.43 0.47 0.45 1103
4 0.43 0.76 0.55 1045
5 0.46 0.35 0.40 930
6 0.54 0.18 0.27 918
accuracy 0.57 6931
macro avg 0.60 0.56 0.55 6931
weighted avg 0.59 0.57 0.56 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.00 0.00 0.00 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.05 0.60 0.10 5
4 0.50 0.15 0.24 13
5 0.05 0.23 0.08 13
6 0.72 0.78 0.75 37
accuracy 0.23 163
macro avg 0.19 0.25 0.17 163
weighted avg 0.21 0.23 0.20 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.77 0.24 0.36 1567
1 0.55 0.93 0.69 1562
2 0.61 0.57 0.59 1163
3 0.44 0.28 0.34 1429
4 0.72 0.15 0.24 755
5 0.31 0.62 0.41 1564
6 0.47 0.35 0.40 1510
accuracy 0.47 9550
macro avg 0.55 0.45 0.43 9550
weighted avg 0.54 0.47 0.45 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.25 0.37 0.30 963
1 0.26 0.47 0.33 619
2 0.74 0.07 0.13 1298
3 0.18 0.42 0.25 1586
4 0.31 0.19 0.23 984
5 0.09 0.11 0.10 1439
6 0.95 0.20 0.34 2304
accuracy 0.24 9193
macro avg 0.40 0.26 0.24 9193
weighted avg 0.46 0.24 0.24 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.44 0.18 0.26 267
1 0.74 0.91 0.81 666
2 0.03 0.82 0.06 45
3 0.96 0.02 0.03 1763
4 0.26 0.57 0.36 188
5 0.10 0.01 0.02 2151
6 0.47 0.97 0.64 2196
accuracy 0.41 7276
macro avg 0.43 0.50 0.31 7276
weighted avg 0.50 0.41 0.30 7276
=== Overall Accuracy ===
0.4245612213582959 [0.508781558726674, 0.5453566299664911, 0.5719232433992208, 0.22699386503067484, 0.4700523560209424, 0.23953007723267702, 0.40929081913139087]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.895746 0.565127 0.940765 0.565037 0.940785
FUSHomozygous_Untreated 0.906166 0.667051 0.937546 0.583621 0.955470
FUSRevertant_Untreated 0.905803 0.594432 0.942288 0.546870 0.951989
OPTN_Untreated 0.777294 0.407656 0.862850 0.407573 0.862891
TBK1_Untreated 0.875957 0.321658 0.948726 0.451628 0.914188
TDP43_Untreated 0.741226 0.239359 0.849572 0.255683 0.838022
WT_Untreated 0.805767 0.506873 0.870361 0.457986 0.890913
Macro Average 0.843994 0.471737 0.907444 0.466914 0.907751
from cuml.svm import SVC as cuSVC
run_baseline_model(
classifier_class=cuSVC,
classifier_kwargs={"kernel": "linear", "C": 1.0},
norm=True, # important for SVMs
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.64 0.89 0.74 1222
1 0.83 0.50 0.63 1245
2 0.83 0.91 0.86 1015
3 0.78 0.54 0.64 2314
4 0.03 0.00 0.00 1876
5 0.26 0.35 0.30 1699
6 0.31 0.69 0.43 1561
accuracy 0.51 10932
macro avg 0.52 0.55 0.52 10932
weighted avg 0.50 0.51 0.48 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.59 0.59 0.59 1231
1 0.57 0.55 0.56 1162
2 0.56 0.99 0.72 800
3 0.49 0.70 0.57 1649
4 0.59 0.62 0.61 1220
5 0.30 0.11 0.16 1508
6 0.78 0.42 0.55 786
accuracy 0.55 8356
macro avg 0.55 0.57 0.54 8356
weighted avg 0.53 0.55 0.52 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.73 0.95 0.83 1004
1 0.91 0.55 0.69 800
2 0.72 0.67 0.70 1131
3 0.43 0.47 0.45 1103
4 0.43 0.76 0.55 1045
5 0.46 0.35 0.40 930
6 0.54 0.18 0.27 918
accuracy 0.57 6931
macro avg 0.60 0.56 0.55 6931
weighted avg 0.59 0.57 0.56 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.00 0.00 0.00 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.05 0.60 0.10 5
4 0.50 0.15 0.24 13
5 0.05 0.23 0.08 13
6 0.72 0.78 0.75 37
accuracy 0.23 163
macro avg 0.19 0.25 0.17 163
weighted avg 0.21 0.23 0.20 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.77 0.24 0.36 1567
1 0.55 0.93 0.69 1562
2 0.61 0.57 0.59 1163
3 0.44 0.28 0.34 1429
4 0.72 0.15 0.24 755
5 0.31 0.62 0.41 1564
6 0.47 0.35 0.40 1510
accuracy 0.47 9550
macro avg 0.55 0.45 0.43 9550
weighted avg 0.54 0.47 0.45 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.25 0.37 0.30 963
1 0.26 0.47 0.33 619
2 0.74 0.07 0.13 1298
3 0.18 0.42 0.25 1586
4 0.31 0.19 0.23 984
5 0.09 0.11 0.10 1439
6 0.95 0.20 0.34 2304
accuracy 0.24 9193
macro avg 0.40 0.26 0.24 9193
weighted avg 0.46 0.24 0.24 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.44 0.18 0.26 267
1 0.74 0.91 0.81 666
2 0.03 0.82 0.06 45
3 0.96 0.02 0.03 1763
4 0.26 0.57 0.36 188
5 0.10 0.01 0.02 2151
6 0.47 0.97 0.64 2196
accuracy 0.41 7276
macro avg 0.43 0.50 0.31 7276
weighted avg 0.50 0.41 0.30 7276
=== Overall Accuracy ===
0.4245612213582959 [0.508781558726674, 0.5453566299664911, 0.5719232433992208, 0.22699386503067484, 0.4700523560209424, 0.23953007723267702, 0.40929081913139087]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.895746 0.565127 0.940765 0.565037 0.940785
FUSHomozygous_Untreated 0.906166 0.667051 0.937546 0.583621 0.955470
FUSRevertant_Untreated 0.905803 0.594432 0.942288 0.546870 0.951989
OPTN_Untreated 0.777294 0.407656 0.862850 0.407573 0.862891
TBK1_Untreated 0.875957 0.321658 0.948726 0.451628 0.914188
TDP43_Untreated 0.741226 0.239359 0.849572 0.255683 0.838022
WT_Untreated 0.805767 0.506873 0.870361 0.457986 0.890913
Macro Average 0.843994 0.471737 0.907444 0.466914 0.907751
from cuml.svm import SVC as cuSVC
run_baseline_model(
classifier_class=cuSVC,
classifier_kwargs={"kernel": "rbf", "C": 1.0, "gamma": "scale"},
norm=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.65 0.89 0.75 1222
1 0.83 0.52 0.64 1245
2 0.89 0.85 0.87 1015
3 0.71 0.63 0.67 2314
4 0.00 0.00 0.00 1876
5 0.30 0.55 0.39 1699
6 0.40 0.58 0.47 1561
accuracy 0.54 10932
macro avg 0.54 0.58 0.54 10932
weighted avg 0.50 0.54 0.51 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.60 0.79 0.68 1231
1 0.68 0.44 0.53 1162
2 0.40 1.00 0.57 800
3 0.52 0.74 0.61 1649
4 0.66 0.45 0.54 1220
5 0.27 0.05 0.08 1508
6 0.76 0.55 0.64 786
accuracy 0.55 8356
macro avg 0.56 0.57 0.52 8356
weighted avg 0.54 0.55 0.50 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.70 0.98 0.82 1004
1 0.95 0.47 0.63 800
2 0.79 0.69 0.74 1131
3 0.41 0.38 0.40 1103
4 0.40 0.87 0.55 1045
5 0.54 0.25 0.35 930
6 0.55 0.25 0.35 918
accuracy 0.57 6931
macro avg 0.62 0.56 0.55 6931
weighted avg 0.61 0.57 0.55 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.00 0.00 0.00 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.05 1.00 0.10 5
4 0.00 0.00 0.00 13
5 0.03 0.08 0.04 13
6 0.84 0.84 0.84 37
accuracy 0.23 163
macro avg 0.13 0.27 0.14 163
weighted avg 0.19 0.23 0.20 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.87 0.15 0.26 1567
1 0.53 0.97 0.69 1562
2 0.63 0.33 0.43 1163
3 0.44 0.41 0.42 1429
4 0.86 0.11 0.19 755
5 0.28 0.69 0.39 1564
6 0.41 0.13 0.20 1510
accuracy 0.43 9550
macro avg 0.57 0.40 0.37 9550
weighted avg 0.55 0.43 0.38 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.19 0.24 0.21 963
1 0.29 0.62 0.40 619
2 0.88 0.04 0.08 1298
3 0.19 0.38 0.25 1586
4 0.40 0.16 0.23 984
5 0.09 0.13 0.11 1439
6 0.99 0.42 0.59 2304
accuracy 0.28 9193
macro avg 0.43 0.28 0.27 9193
weighted avg 0.50 0.28 0.29 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.06 0.01 0.01 267
1 0.71 0.95 0.81 666
2 0.02 0.56 0.03 45
3 0.97 0.02 0.04 1763
4 0.26 0.45 0.33 188
5 0.05 0.01 0.01 2151
6 0.51 0.97 0.67 2196
accuracy 0.40 7276
macro avg 0.37 0.42 0.27 7276
weighted avg 0.48 0.40 0.30 7276
=== Overall Accuracy ===
0.4280018711276649 [0.5405232345407976, 0.5460746768788894, 0.569470494878084, 0.22699386503067484, 0.42879581151832463, 0.2817361035570543, 0.40241891148982956]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.895097 0.561146 0.940569 0.562490 0.940264
FUSHomozygous_Untreated 0.906376 0.670012 0.937395 0.584110 0.955842
FUSRevertant_Untreated 0.887655 0.529294 0.929645 0.468513 0.943995
OPTN_Untreated 0.785348 0.441974 0.864824 0.430777 0.870059
TBK1_Untreated 0.876930 0.293702 0.953497 0.453299 0.911372
TDP43_Untreated 0.728383 0.269454 0.827459 0.252137 0.839912
WT_Untreated 0.838858 0.528028 0.906032 0.548405 0.898814
Macro Average 0.845521 0.470516 0.908489 0.471390 0.908608
from cuml.neighbors import KNeighborsClassifier as cuKNNClassifier
run_baseline_model(
classifier_class=cuKNNClassifier,
classifier_kwargs={"n_neighbors": 5},
norm=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.39 0.47 0.43 1222
1 0.50 0.75 0.60 1245
2 0.76 0.25 0.37 1015
3 0.42 0.43 0.43 2314
4 0.17 0.05 0.07 1876
5 0.27 0.66 0.38 1699
6 0.52 0.06 0.11 1561
accuracy 0.37 10932
macro avg 0.43 0.38 0.34 10932
weighted avg 0.40 0.37 0.33 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.57 0.41 0.48 1231
1 0.53 0.72 0.61 1162
2 0.35 0.86 0.49 800
3 0.44 0.51 0.47 1649
4 0.33 0.31 0.32 1220
5 0.27 0.12 0.17 1508
6 0.53 0.11 0.18 786
accuracy 0.42 8356
macro avg 0.43 0.44 0.39 8356
weighted avg 0.42 0.42 0.39 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.46 0.78 0.58 1004
1 0.45 0.27 0.33 800
2 0.66 0.10 0.18 1131
3 0.31 0.30 0.31 1103
4 0.29 0.82 0.43 1045
5 0.36 0.11 0.17 930
6 0.41 0.10 0.16 918
accuracy 0.36 6931
macro avg 0.42 0.36 0.31 6931
weighted avg 0.42 0.36 0.31 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.69 0.35 0.46 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.04 1.00 0.08 5
4 0.00 0.00 0.00 13
5 0.00 0.00 0.00 13
6 0.81 0.70 0.75 37
accuracy 0.25 163
macro avg 0.22 0.29 0.19 163
weighted avg 0.30 0.25 0.25 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.61 0.38 0.47 1567
1 0.54 0.73 0.62 1562
2 0.21 0.14 0.17 1163
3 0.25 0.32 0.28 1429
4 0.13 0.11 0.12 755
5 0.24 0.38 0.29 1564
6 0.39 0.19 0.26 1510
accuracy 0.35 9550
macro avg 0.34 0.32 0.31 9550
weighted avg 0.36 0.35 0.34 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.25 0.34 0.29 963
1 0.22 0.77 0.35 619
2 0.36 0.05 0.09 1298
3 0.24 0.21 0.23 1586
4 0.10 0.01 0.02 984
5 0.15 0.23 0.18 1439
6 0.68 0.54 0.60 2304
accuracy 0.31 9193
macro avg 0.29 0.31 0.25 9193
weighted avg 0.34 0.31 0.29 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.05 0.09 0.06 267
1 0.45 0.39 0.42 666
2 0.01 0.47 0.02 45
3 0.69 0.03 0.06 1763
4 0.14 0.06 0.09 188
5 0.19 0.03 0.05 2151
6 0.54 0.89 0.67 2196
accuracy 0.33 7276
macro avg 0.29 0.28 0.19 7276
weighted avg 0.43 0.33 0.27 7276
=== Overall Accuracy ===
0.33995055533638296 [0.37193560190267105, 0.42161321206318814, 0.3612754292309912, 0.24539877300613497, 0.34701570680628274, 0.305449798759926, 0.32696536558548656]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.855690 0.450796 0.910822 0.407690 0.924126
FUSHomozygous_Untreated 0.864449 0.634973 0.894564 0.441446 0.949172
FUSRevertant_Untreated 0.839125 0.238173 0.909541 0.235771 0.910628
OPTN_Untreated 0.761092 0.306630 0.866281 0.346728 0.843698
TBK1_Untreated 0.833973 0.235981 0.912478 0.261432 0.900964
TDP43_Untreated 0.719528 0.256664 0.819454 0.234831 0.836238
WT_Untreated 0.837350 0.407109 0.930330 0.558074 0.878947
Macro Average 0.815887 0.361475 0.891924 0.355139 0.891967
from cuml.neighbors import KNeighborsClassifier as cuKNNClassifier
run_baseline_model(
classifier_class=cuKNNClassifier,
classifier_kwargs={"n_neighbors": 50},
norm=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.47 0.39 0.43 1222
1 0.57 0.90 0.70 1245
2 0.96 0.17 0.30 1015
3 0.52 0.45 0.48 2314
4 0.05 0.01 0.02 1876
5 0.27 0.83 0.41 1699
6 0.61 0.02 0.04 1561
accuracy 0.39 10932
macro avg 0.49 0.40 0.34 10932
weighted avg 0.46 0.39 0.33 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.67 0.36 0.46 1231
1 0.57 0.80 0.67 1162
2 0.41 0.94 0.57 800
3 0.52 0.64 0.57 1649
4 0.44 0.44 0.44 1220
5 0.30 0.17 0.21 1508
6 0.72 0.11 0.19 786
accuracy 0.48 8356
macro avg 0.52 0.49 0.44 8356
weighted avg 0.50 0.48 0.45 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.56 0.86 0.68 1004
1 0.62 0.30 0.40 800
2 0.88 0.05 0.10 1131
3 0.30 0.23 0.26 1103
4 0.26 0.96 0.41 1045
5 0.42 0.06 0.10 930
6 0.56 0.06 0.12 918
accuracy 0.36 6931
macro avg 0.51 0.36 0.29 6931
weighted avg 0.51 0.36 0.29 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.70 0.27 0.39 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.05 1.00 0.09 5
4 0.00 0.00 0.00 13
5 0.00 0.00 0.00 13
6 0.78 0.84 0.81 37
accuracy 0.26 163
macro avg 0.22 0.30 0.18 163
weighted avg 0.29 0.26 0.25 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.74 0.23 0.35 1567
1 0.54 0.86 0.67 1562
2 0.30 0.04 0.07 1163
3 0.30 0.28 0.29 1429
4 0.16 0.09 0.12 755
5 0.26 0.60 0.36 1564
6 0.45 0.29 0.35 1510
accuracy 0.38 9550
macro avg 0.39 0.34 0.32 9550
weighted avg 0.42 0.38 0.34 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.31 0.28 0.29 963
1 0.27 0.86 0.41 619
2 0.38 0.01 0.02 1298
3 0.26 0.23 0.25 1586
4 0.06 0.00 0.01 984
5 0.19 0.34 0.24 1439
6 0.73 0.75 0.74 2304
accuracy 0.37 9193
macro avg 0.31 0.35 0.28 9193
weighted avg 0.37 0.37 0.33 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.01 0.02 0.02 267
1 0.50 0.39 0.44 666
2 0.02 0.67 0.04 45
3 0.88 0.01 0.02 1763
4 0.09 0.02 0.03 188
5 0.08 0.01 0.01 2151
6 0.46 0.99 0.63 2196
accuracy 0.34 7276
macro avg 0.29 0.30 0.17 7276
weighted avg 0.43 0.34 0.24 7276
=== Overall Accuracy ===
0.37046544679203297 [0.39096231247713137, 0.48468166586883676, 0.3648824123503102, 0.26380368098159507, 0.3769633507853403, 0.3691939519199391, 0.3427707531610775]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.876987 0.386146 0.943822 0.483453 0.918645
FUSHomozygous_Untreated 0.882254 0.726929 0.902638 0.494904 0.961815
FUSRevertant_Untreated 0.866968 0.195051 0.945699 0.296214 0.909311
OPTN_Untreated 0.783897 0.318611 0.891591 0.404851 0.849698
TBK1_Untreated 0.829278 0.269692 0.902742 0.266884 0.903991
TDP43_Untreated 0.702658 0.339209 0.781122 0.250695 0.845574
WT_Untreated 0.835957 0.486899 0.911393 0.542864 0.891530
Macro Average 0.825429 0.388934 0.897001 0.391409 0.897223
from xgboost import XGBClassifier
run_baseline_model(
classifier_class=XGBClassifier,
classifier_kwargs={
"tree_method": "gpu_hist",
"predictor": "gpu_predictor",
"n_estimators": 100,
"max_depth": 6,
"use_label_encoder": False,
"eval_metric": "mlogloss"
},
norm=False
)
=== Batch 1 === Train: (41469, 5568) Labels: [0 1 2 3 4 5 6] Test: (10932, 5568) Labels: [0 1 2 3 4 5 6] FUSHeterozygous_Untreated: 5058 FUSHomozygous_Untreated: 4834 FUSRevertant_Untreated: 4481 OPTN_Untreated: 7535 TBK1_Untreated: 4205 TDP43_Untreated: 7605 WT_Untreated: 7751
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:43:55] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:43:55] WARNING: /workspace/src/learner.cc:740:
Parameters: { "predictor", "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:44:58] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
precision recall f1-score support
0 0.58 0.87 0.70 1222
1 0.76 0.38 0.51 1245
2 0.82 0.62 0.71 1015
3 0.55 0.64 0.59 2314
4 0.06 0.00 0.01 1876
5 0.31 0.39 0.35 1699
6 0.26 0.46 0.33 1561
accuracy 0.46 10932
macro avg 0.48 0.48 0.46 10932
weighted avg 0.44 0.46 0.43 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:45:37] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:45:37] WARNING: /workspace/src/learner.cc:740:
Parameters: { "predictor", "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:46:31] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
precision recall f1-score support
0 0.57 0.73 0.64 1231
1 0.60 0.41 0.49 1162
2 0.36 0.98 0.53 800
3 0.49 0.58 0.53 1649
4 0.60 0.32 0.42 1220
5 0.30 0.09 0.14 1508
6 0.57 0.55 0.56 786
accuracy 0.49 8356
macro avg 0.50 0.52 0.47 8356
weighted avg 0.49 0.49 0.46 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:46:46] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:46:46] WARNING: /workspace/src/learner.cc:740:
Parameters: { "predictor", "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:47:41] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
precision recall f1-score support
0 0.63 0.94 0.75 1004
1 0.83 0.29 0.43 800
2 0.72 0.34 0.46 1131
3 0.33 0.32 0.32 1103
4 0.39 0.87 0.53 1045
5 0.39 0.28 0.33 930
6 0.37 0.20 0.26 918
accuracy 0.47 6931
macro avg 0.52 0.46 0.44 6931
weighted avg 0.51 0.47 0.44 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:47:58] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:47:58] WARNING: /workspace/src/learner.cc:740:
Parameters: { "predictor", "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:48:57] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.30 0.12 0.17 26
1 0.67 0.08 0.14 25
2 0.00 0.00 0.00 44
3 0.04 0.40 0.08 5
4 0.25 0.15 0.19 13
5 0.11 0.46 0.18 13
6 0.72 0.78 0.75 37
accuracy 0.27 163
macro avg 0.30 0.28 0.22 163
weighted avg 0.34 0.27 0.25 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:49:13] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:49:13] WARNING: /workspace/src/learner.cc:740:
Parameters: { "predictor", "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:50:07] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
precision recall f1-score support
0 0.81 0.10 0.18 1567
1 0.52 0.97 0.68 1562
2 0.38 0.16 0.23 1163
3 0.33 0.26 0.29 1429
4 0.59 0.09 0.15 755
5 0.25 0.61 0.36 1564
6 0.30 0.19 0.24 1510
accuracy 0.37 9550
macro avg 0.46 0.34 0.30 9550
weighted avg 0.45 0.37 0.32 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:50:42] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:50:42] WARNING: /workspace/src/learner.cc:740:
Parameters: { "predictor", "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:51:42] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
precision recall f1-score support
0 0.12 0.19 0.14 963
1 0.24 0.76 0.36 619
2 0.93 0.03 0.06 1298
3 0.05 0.09 0.06 1586
4 0.12 0.04 0.06 984
5 0.03 0.04 0.03 1439
6 0.90 0.20 0.32 2304
accuracy 0.15 9193
macro avg 0.34 0.19 0.15 9193
weighted avg 0.41 0.15 0.15 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:52:27] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:52:27] WARNING: /workspace/src/learner.cc:740:
Parameters: { "predictor", "use_label_encoder" } are not used.
warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:53:28] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.
E.g. tree_method = "hist", device = "cuda"
warnings.warn(smsg, UserWarning)
precision recall f1-score support
0 0.25 0.07 0.11 267
1 0.71 0.92 0.80 666
2 0.03 0.80 0.06 45
3 0.96 0.01 0.03 1763
4 0.25 0.24 0.25 188
5 0.05 0.01 0.01 2151
6 0.44 0.95 0.60 2196
accuracy 0.39 7276
macro avg 0.38 0.43 0.26 7276
weighted avg 0.46 0.39 0.27 7276
=== Overall Accuracy ===
0.3713374682263305 [0.46084888401024515, 0.48851124940162755, 0.47150483335737986, 0.26993865030674846, 0.37026178010471206, 0.1498966605025563, 0.3884002199010445]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.876930 0.521338 0.925349 0.487420 0.934200
FUSHomozygous_Untreated 0.885785 0.621155 0.920513 0.506302 0.948757
FUSRevertant_Untreated 0.875728 0.377365 0.934122 0.401627 0.927557
OPTN_Untreated 0.748841 0.337395 0.844073 0.333702 0.846241
TBK1_Untreated 0.868819 0.239270 0.951468 0.392925 0.905006
TDP43_Untreated 0.729032 0.224312 0.837993 0.230125 0.833449
WT_Untreated 0.784775 0.449205 0.857295 0.404859 0.878081
Macro Average 0.824273 0.395720 0.895831 0.393851 0.896184
from sklearn.neural_network import MLPClassifier
run_baseline_model(
classifier_class=MLPClassifier,
classifier_kwargs={
"hidden_layer_sizes": (100,), # You can adjust e.g., (256, 128)
"activation": "relu",
"solver": "adam",
"max_iter": 200,
"random_state": 42
},
norm=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.50 0.90 0.64 1222
1 0.72 0.32 0.44 1245
2 0.86 0.88 0.87 1015
3 0.72 0.63 0.67 2314
4 0.09 0.00 0.00 1876
5 0.31 0.38 0.34 1699
6 0.38 0.73 0.50 1561
accuracy 0.52 10932
macro avg 0.51 0.55 0.50 10932
weighted avg 0.49 0.52 0.47 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.54 0.77 0.63 1231
1 0.67 0.31 0.42 1162
2 0.28 1.00 0.44 800
3 0.60 0.56 0.58 1649
4 0.72 0.29 0.41 1220
5 0.25 0.06 0.09 1508
6 0.60 0.64 0.62 786
accuracy 0.47 8356
macro avg 0.52 0.52 0.46 8356
weighted avg 0.52 0.47 0.44 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.63 0.96 0.76 1004
1 0.89 0.39 0.55 800
2 0.65 0.80 0.72 1131
3 0.42 0.25 0.31 1103
4 0.46 0.76 0.57 1045
5 0.63 0.23 0.34 930
6 0.56 0.58 0.57 918
accuracy 0.58 6931
macro avg 0.61 0.57 0.55 6931
weighted avg 0.60 0.58 0.55 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.00 0.00 0.00 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.05 0.60 0.09 5
4 1.00 0.08 0.14 13
5 0.05 0.23 0.09 13
6 0.69 0.78 0.73 37
accuracy 0.22 163
macro avg 0.26 0.24 0.15 163
weighted avg 0.24 0.22 0.19 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.84 0.26 0.40 1567
1 0.55 0.94 0.69 1562
2 0.52 0.48 0.50 1163
3 0.33 0.43 0.38 1429
4 0.84 0.18 0.30 755
5 0.24 0.44 0.31 1564
6 0.41 0.10 0.16 1510
accuracy 0.42 9550
macro avg 0.53 0.41 0.39 9550
weighted avg 0.51 0.42 0.40 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.26 0.29 0.28 963
1 0.25 0.53 0.34 619
2 0.72 0.09 0.16 1298
3 0.31 0.59 0.40 1586
4 0.40 0.25 0.31 984
5 0.13 0.13 0.13 1439
6 0.94 0.64 0.76 2304
accuracy 0.39 9193
macro avg 0.43 0.36 0.34 9193
weighted avg 0.50 0.39 0.39 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.00 0.00 0.00 267
1 0.72 0.86 0.78 666
2 0.01 0.22 0.01 45
3 1.00 0.01 0.01 1763
4 0.25 0.57 0.34 188
5 0.05 0.00 0.01 2151
6 0.55 0.95 0.70 2196
accuracy 0.39 7276
macro avg 0.37 0.37 0.26 7276
weighted avg 0.49 0.39 0.30 7276
=== Overall Accuracy ===
0.4262872123601977 [0.5162824734723747, 0.4743896601244615, 0.5765401817919492, 0.22085889570552147, 0.42303664921465967, 0.3872511693679974, 0.38565145684442004]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.885880 0.587420 0.926519 0.521192 0.942833
FUSHomozygous_Untreated 0.896471 0.566376 0.939791 0.552471 0.942905
FUSRevertant_Untreated 0.858953 0.598071 0.889521 0.388121 0.949718
OPTN_Untreated 0.797771 0.428267 0.883296 0.459277 0.869704
TBK1_Untreated 0.880269 0.270021 0.960384 0.472246 0.909267
TDP43_Untreated 0.752199 0.197120 0.872033 0.249558 0.834192
WT_Untreated 0.846224 0.637135 0.891411 0.559084 0.919141
Macro Average 0.845396 0.469201 0.908994 0.457421 0.909680
from sklearn.neural_network import MLPClassifier
run_baseline_model(
classifier_class=MLPClassifier,
classifier_kwargs={
"hidden_layer_sizes": (100,), # You can adjust e.g., (256, 128)
"activation": "relu",
"solver": "adam",
"max_iter": 200,
"random_state": 42
},
norm=False
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.53 0.90 0.67 1222
1 0.71 0.20 0.31 1245
2 0.82 0.86 0.84 1015
3 0.77 0.68 0.72 2314
4 0.07 0.00 0.00 1876
5 0.30 0.63 0.40 1699
6 0.40 0.46 0.43 1561
accuracy 0.51 10932
macro avg 0.51 0.53 0.48 10932
weighted avg 0.50 0.51 0.47 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.58 0.77 0.66 1231
1 0.66 0.38 0.48 1162
2 0.40 0.99 0.57 800
3 0.62 0.67 0.64 1649
4 0.74 0.47 0.57 1220
5 0.40 0.15 0.22 1508
6 0.54 0.64 0.58 786
accuracy 0.55 8356
macro avg 0.56 0.58 0.53 8356
weighted avg 0.57 0.55 0.52 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.65 0.94 0.77 1004
1 0.82 0.39 0.53 800
2 0.58 0.76 0.66 1131
3 0.44 0.26 0.33 1103
4 0.37 0.68 0.48 1045
5 0.43 0.16 0.23 930
6 0.58 0.45 0.51 918
accuracy 0.53 6931
macro avg 0.55 0.52 0.50 6931
weighted avg 0.54 0.53 0.50 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.50 0.04 0.07 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.07 0.80 0.12 5
4 0.75 0.23 0.35 13
5 0.06 0.15 0.09 13
6 0.54 0.95 0.69 37
accuracy 0.28 163
macro avg 0.27 0.31 0.19 163
weighted avg 0.27 0.28 0.21 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.89 0.12 0.22 1567
1 0.53 0.98 0.69 1562
2 0.70 0.57 0.63 1163
3 0.36 0.40 0.38 1429
4 0.78 0.22 0.35 755
5 0.30 0.66 0.41 1564
6 0.31 0.06 0.10 1510
accuracy 0.44 9550
macro avg 0.55 0.43 0.40 9550
weighted avg 0.53 0.44 0.39 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.33 0.24 0.28 963
1 0.22 0.70 0.34 619
2 0.63 0.15 0.24 1298
3 0.30 0.49 0.37 1586
4 0.47 0.49 0.48 984
5 0.15 0.18 0.16 1439
6 0.98 0.34 0.51 2304
accuracy 0.35 9193
macro avg 0.44 0.37 0.34 9193
weighted avg 0.51 0.35 0.35 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.04 0.01 0.02 267
1 0.69 0.87 0.77 666
2 0.01 0.51 0.02 45
3 1.00 0.02 0.04 1763
4 0.22 0.59 0.32 188
5 0.18 0.02 0.04 2151
6 0.55 0.88 0.68 2196
accuracy 0.38 7276
macro avg 0.38 0.42 0.27 7276
weighted avg 0.53 0.38 0.30 7276
=== Overall Accuracy ===
0.43327501133055335 [0.5113428466886205, 0.549186213499282, 0.530370797864666, 0.27607361963190186, 0.44418848167539265, 0.34504514304362016, 0.37671797691039033]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.892865 0.545541 0.940157 0.553831 0.938245
FUSHomozygous_Untreated 0.883914 0.583155 0.923384 0.499718 0.944070
FUSRevertant_Untreated 0.876453 0.618086 0.906726 0.437082 0.952968
OPTN_Untreated 0.811721 0.442685 0.897138 0.499027 0.874290
TBK1_Untreated 0.876758 0.338431 0.947431 0.458046 0.916027
TDP43_Untreated 0.738287 0.299656 0.832981 0.279191 0.846375
WT_Untreated 0.838114 0.481207 0.915245 0.550965 0.890869
Macro Average 0.845445 0.472680 0.909009 0.468266 0.908978
from sklearn.neural_network import MLPClassifier
run_baseline_model(
classifier_class=MLPClassifier,
classifier_kwargs={
"hidden_layer_sizes": (200,), # You can adjust e.g., (256, 128)
"activation": "relu",
"solver": "adam",
"max_iter": 200,
"random_state": 42
},
norm=False
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.50 0.79 0.62 1222
1 0.56 0.23 0.33 1245
2 0.82 0.89 0.85 1015
3 0.78 0.61 0.69 2314
4 0.07 0.00 0.00 1876
5 0.27 0.42 0.33 1699
6 0.32 0.59 0.41 1561
accuracy 0.48 10932
macro avg 0.47 0.51 0.46 10932
weighted avg 0.46 0.48 0.44 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.56 0.83 0.67 1231
1 0.69 0.31 0.42 1162
2 0.40 1.00 0.57 800
3 0.63 0.67 0.65 1649
4 0.69 0.37 0.48 1220
5 0.42 0.12 0.19 1508
6 0.50 0.73 0.59 786
accuracy 0.54 8356
macro avg 0.56 0.58 0.51 8356
weighted avg 0.57 0.54 0.50 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.66 0.97 0.79 1004
1 0.92 0.41 0.57 800
2 0.68 0.76 0.72 1131
3 0.39 0.18 0.25 1103
4 0.39 0.81 0.53 1045
5 0.46 0.22 0.30 930
6 0.57 0.43 0.49 918
accuracy 0.55 6931
macro avg 0.58 0.54 0.52 6931
weighted avg 0.57 0.55 0.52 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 1.00 0.12 0.21 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.06 0.60 0.10 5
4 0.75 0.23 0.35 13
5 0.08 0.31 0.13 13
6 0.65 0.95 0.77 37
accuracy 0.29 163
macro avg 0.36 0.31 0.22 163
weighted avg 0.37 0.29 0.25 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.86 0.15 0.25 1567
1 0.53 0.97 0.69 1562
2 0.66 0.60 0.63 1163
3 0.38 0.51 0.43 1429
4 0.77 0.22 0.34 755
5 0.27 0.51 0.35 1564
6 0.43 0.08 0.13 1510
accuracy 0.44 9550
macro avg 0.56 0.43 0.40 9550
weighted avg 0.54 0.44 0.40 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.43 0.34 0.38 963
1 0.27 0.69 0.39 619
2 0.74 0.16 0.26 1298
3 0.31 0.52 0.39 1586
4 0.57 0.37 0.45 984
5 0.13 0.22 0.16 1439
6 0.99 0.33 0.49 2304
accuracy 0.35 9193
macro avg 0.49 0.38 0.36 9193
weighted avg 0.55 0.35 0.37 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.01 0.00 0.01 267
1 0.71 0.88 0.78 666
2 0.02 0.51 0.03 45
3 1.00 0.01 0.01 1763
4 0.25 0.60 0.35 188
5 0.07 0.01 0.02 2151
6 0.51 0.97 0.67 2196
accuracy 0.40 7276
macro avg 0.37 0.43 0.27 7276
weighted avg 0.49 0.40 0.29 7276
=== Overall Accuracy ===
0.43613351337378764 [0.47676545920234176, 0.5385351842987075, 0.551435579281489, 0.294478527607362, 0.4449214659685864, 0.3507016207984336, 0.3960967564595932]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.894353 0.561146 0.939724 0.559010 0.940213
FUSHomozygous_Untreated 0.890327 0.577233 0.931415 0.524828 0.943782
FUSRevertant_Untreated 0.891949 0.633552 0.922226 0.488359 0.955512
OPTN_Untreated 0.809088 0.436389 0.895352 0.491144 0.872829
TBK1_Untreated 0.878628 0.320506 0.951900 0.466603 0.914317
TDP43_Untreated 0.730482 0.241294 0.836091 0.241164 0.836188
WT_Untreated 0.818782 0.529424 0.881315 0.490840 0.896546
Macro Average 0.844801 0.471364 0.908289 0.465993 0.908484
from sklearn.neural_network import MLPClassifier
run_baseline_model(
classifier_class=MLPClassifier,
classifier_kwargs={
"hidden_layer_sizes": (50,), # You can adjust e.g., (256, 128)
"activation": "relu",
"solver": "adam",
"max_iter": 200,
"random_state": 42
},
apply_pca = True,
pca_components = 100
)
=== Batch 1 === Train: (41469, 5568) Labels: [0 1 2 3 4 5 6] Test: (10932, 5568) Labels: [0 1 2 3 4 5 6] FUSHeterozygous_Untreated: 5058 FUSHomozygous_Untreated: 4834 FUSRevertant_Untreated: 4481 OPTN_Untreated: 7535 TBK1_Untreated: 4205 TDP43_Untreated: 7605 WT_Untreated: 7751
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
precision recall f1-score support
0 0.38 0.53 0.44 1222
1 0.24 0.14 0.18 1245
2 0.59 0.68 0.63 1015
3 0.47 0.52 0.50 2314
4 0.03 0.00 0.00 1876
5 0.31 0.59 0.41 1699
6 0.37 0.33 0.35 1561
accuracy 0.39 10932
macro avg 0.34 0.40 0.36 10932
weighted avg 0.33 0.39 0.35 10932
here!
here2!
here3!
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
precision recall f1-score support
0 0.59 0.79 0.67 1231
1 0.69 0.40 0.50 1162
2 0.23 0.98 0.38 800
3 0.53 0.47 0.50 1649
4 0.58 0.18 0.27 1220
5 0.13 0.02 0.03 1508
6 0.56 0.45 0.50 786
accuracy 0.43 8356
macro avg 0.47 0.47 0.41 8356
weighted avg 0.47 0.43 0.40 8356
here!
here2!
here3!
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
precision recall f1-score support
0 0.63 0.83 0.72 1004
1 0.66 0.40 0.50 800
2 0.74 0.53 0.62 1131
3 0.27 0.21 0.24 1103
4 0.37 0.84 0.52 1045
5 0.40 0.11 0.18 930
6 0.44 0.38 0.41 918
accuracy 0.48 6931
macro avg 0.50 0.47 0.45 6931
weighted avg 0.50 0.48 0.46 6931
here!
here2!
here3!
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn( /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.56 0.35 0.43 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.06 1.00 0.11 5
4 0.00 0.00 0.00 13
5 0.03 0.08 0.04 13
6 0.96 0.59 0.73 37
accuracy 0.23 163
macro avg 0.23 0.29 0.19 163
weighted avg 0.31 0.23 0.24 163
here!
here2!
here3!
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
precision recall f1-score support
0 0.94 0.19 0.31 1567
1 0.55 0.98 0.70 1562
2 0.28 0.14 0.19 1163
3 0.25 0.36 0.30 1429
4 0.43 0.30 0.36 755
5 0.26 0.43 0.33 1564
6 0.30 0.15 0.20 1510
accuracy 0.38 9550
macro avg 0.43 0.36 0.34 9550
weighted avg 0.44 0.38 0.35 9550
here!
here2!
here3!
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
precision recall f1-score support
0 0.16 0.09 0.11 963
1 0.23 0.81 0.36 619
2 0.14 0.04 0.06 1298
3 0.32 0.40 0.36 1586
4 0.39 0.38 0.38 984
5 0.15 0.22 0.18 1439
6 0.90 0.43 0.58 2304
accuracy 0.32 9193
macro avg 0.33 0.34 0.29 9193
weighted avg 0.40 0.32 0.32 9193
here!
here2!
here3!
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
precision recall f1-score support
0 0.02 0.01 0.01 267
1 0.66 0.76 0.71 666
2 0.00 0.27 0.01 45
3 0.90 0.01 0.01 1763
4 0.22 0.25 0.23 188
5 0.02 0.00 0.00 2151
6 0.62 0.84 0.72 2196
accuracy 0.33 7276
macro avg 0.35 0.30 0.24 7276
weighted avg 0.48 0.33 0.29 7276
here!
here2!
here3!
=== Overall Accuracy ===
0.36560145872024213 [0.38922429564581046, 0.43011010052656773, 0.47972875486942723, 0.22699386503067484, 0.37926701570680627, 0.320461220493854, 0.33342495876855416]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.880327 0.452229 0.938618 0.500794 0.926386
FUSHomozygous_Untreated 0.872216 0.574272 0.911316 0.459403 0.942235
FUSRevertant_Untreated 0.804183 0.419760 0.849227 0.245975 0.925875
OPTN_Untreated 0.768554 0.345111 0.866563 0.374463 0.851122
TBK1_Untreated 0.863037 0.287124 0.938644 0.380558 0.909334
TDP43_Untreated 0.742581 0.228289 0.853609 0.251868 0.836699
WT_Untreated 0.840270 0.461555 0.922115 0.561536 0.887948
Macro Average 0.824453 0.395477 0.897156 0.396371 0.897086
from sklearn.neural_network import MLPClassifier
run_baseline_model(
classifier_class=MLPClassifier,
classifier_kwargs={
"hidden_layer_sizes": (100,), # You can adjust e.g., (256, 128)
"activation": "relu",
"solver": "adam",
"max_iter": 200,
"random_state": 42
},
apply_pca = True,
pca_components = 200
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.46 0.79 0.58 1222
1 0.43 0.14 0.22 1245
2 0.61 0.78 0.69 1015
3 0.54 0.45 0.49 2314
4 0.07 0.00 0.01 1876
5 0.28 0.39 0.33 1699
6 0.30 0.53 0.38 1561
accuracy 0.41 10932
macro avg 0.39 0.44 0.39 10932
weighted avg 0.37 0.41 0.36 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.60 0.74 0.67 1231
1 0.73 0.45 0.56 1162
2 0.27 0.98 0.43 800
3 0.54 0.40 0.46 1649
4 0.49 0.32 0.39 1220
5 0.17 0.05 0.08 1508
6 0.54 0.51 0.52 786
accuracy 0.45 8356
macro avg 0.48 0.49 0.44 8356
weighted avg 0.48 0.45 0.43 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.65 0.91 0.76 1004
1 0.80 0.41 0.54 800
2 0.71 0.61 0.66 1131
3 0.29 0.20 0.24 1103
4 0.37 0.83 0.51 1045
5 0.43 0.14 0.21 930
6 0.55 0.44 0.49 918
accuracy 0.51 6931
macro avg 0.54 0.51 0.49 6931
weighted avg 0.53 0.51 0.49 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.80 0.15 0.26 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.04 1.00 0.09 5
4 0.00 0.00 0.00 13
5 0.00 0.00 0.00 13
6 0.68 0.51 0.58 37
accuracy 0.17 163
macro avg 0.22 0.24 0.13 163
weighted avg 0.28 0.17 0.18 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.83 0.20 0.32 1567
1 0.54 0.92 0.68 1562
2 0.45 0.24 0.31 1163
3 0.29 0.46 0.36 1429
4 0.58 0.20 0.30 755
5 0.20 0.30 0.24 1564
6 0.31 0.20 0.24 1510
accuracy 0.38 9550
macro avg 0.46 0.36 0.35 9550
weighted avg 0.45 0.38 0.36 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.30 0.22 0.26 963
1 0.24 0.70 0.35 619
2 0.25 0.06 0.10 1298
3 0.33 0.43 0.38 1586
4 0.33 0.29 0.31 984
5 0.17 0.24 0.20 1439
6 0.87 0.50 0.63 2304
accuracy 0.35 9193
macro avg 0.35 0.35 0.32 9193
weighted avg 0.42 0.35 0.35 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.01 0.00 0.00 267
1 0.65 0.72 0.68 666
2 0.00 0.18 0.01 45
3 0.87 0.01 0.03 1763
4 0.22 0.42 0.29 188
5 0.10 0.01 0.02 2151
6 0.53 0.91 0.67 2196
accuracy 0.36 7276
macro avg 0.34 0.32 0.24 7276
weighted avg 0.46 0.36 0.29 7276
=== Overall Accuracy ===
0.3759157995635852 [0.4101719721917307, 0.45021541407371946, 0.5129129995671621, 0.17177914110429449, 0.37863874345549736, 0.3473294898292179, 0.3603628367234744]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.887101 0.530414 0.935669 0.528898 0.936035
FUSHomozygous_Untreated 0.883266 0.557164 0.926061 0.497211 0.940951
FUSRevertant_Untreated 0.841034 0.479803 0.883360 0.325234 0.935453
OPTN_Untreated 0.778821 0.336278 0.881251 0.395935 0.851553
TBK1_Untreated 0.861701 0.293866 0.936248 0.377004 0.909905
TDP43_Untreated 0.738994 0.183147 0.858993 0.218995 0.829673
WT_Untreated 0.820519 0.547895 0.879436 0.495484 0.900010
Macro Average 0.830205 0.418367 0.900145 0.405537 0.900511
from sklearn.neural_network import MLPClassifier
run_baseline_model(
classifier_class=MLPClassifier,
classifier_kwargs={
"hidden_layer_sizes": (100,50), # You can adjust e.g., (256, 128)
"activation": "relu",
"solver": "adam",
"max_iter": 200,
"random_state": 42
},
apply_pca = False,
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.50 0.90 0.65 1222
1 0.58 0.12 0.20 1245
2 0.73 0.90 0.80 1015
3 0.79 0.67 0.72 2314
4 0.11 0.00 0.00 1876
5 0.28 0.43 0.34 1699
6 0.31 0.50 0.38 1561
accuracy 0.48 10932
macro avg 0.47 0.50 0.44 10932
weighted avg 0.46 0.48 0.43 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.59 0.85 0.70 1231
1 0.74 0.36 0.48 1162
2 0.34 1.00 0.51 800
3 0.67 0.61 0.64 1649
4 0.72 0.38 0.50 1220
5 0.33 0.12 0.18 1508
6 0.57 0.72 0.64 786
accuracy 0.54 8356
macro avg 0.57 0.58 0.52 8356
weighted avg 0.57 0.54 0.51 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.62 0.97 0.76 1004
1 0.88 0.29 0.43 800
2 0.65 0.78 0.71 1131
3 0.38 0.28 0.32 1103
4 0.42 0.75 0.54 1045
5 0.38 0.10 0.16 930
6 0.58 0.49 0.53 918
accuracy 0.54 6931
macro avg 0.56 0.52 0.49 6931
weighted avg 0.55 0.54 0.50 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.33 0.04 0.07 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.05 0.60 0.08 5
4 0.67 0.15 0.25 13
5 0.12 0.23 0.16 13
6 0.53 0.95 0.68 37
accuracy 0.27 163
macro avg 0.24 0.28 0.18 163
weighted avg 0.24 0.27 0.20 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.93 0.11 0.20 1567
1 0.52 0.98 0.68 1562
2 0.71 0.52 0.60 1163
3 0.35 0.42 0.38 1429
4 0.68 0.17 0.28 755
5 0.28 0.59 0.38 1564
6 0.37 0.10 0.15 1510
accuracy 0.43 9550
macro avg 0.55 0.41 0.38 9550
weighted avg 0.54 0.43 0.38 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.35 0.33 0.34 963
1 0.26 0.65 0.37 619
2 0.80 0.15 0.26 1298
3 0.28 0.53 0.37 1586
4 0.38 0.38 0.38 984
5 0.10 0.10 0.10 1439
6 0.96 0.44 0.61 2304
accuracy 0.36 9193
macro avg 0.45 0.37 0.35 9193
weighted avg 0.51 0.36 0.37 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.03 0.01 0.01 267
1 0.70 0.89 0.78 666
2 0.01 0.18 0.01 45
3 0.84 0.02 0.04 1763
4 0.25 0.69 0.37 188
5 0.03 0.00 0.01 2151
6 0.49 0.96 0.65 2196
accuracy 0.40 7276
macro avg 0.33 0.39 0.27 7276
weighted avg 0.43 0.40 0.29 7276
=== Overall Accuracy ===
0.4296564697543368 [0.47923527259421883, 0.5351842987075156, 0.5380176020776223, 0.26993865030674846, 0.42921465968586386, 0.358533666920483, 0.39747113798790545]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.890556 0.577707 0.933154 0.540605 0.941957
FUSHomozygous_Untreated 0.888475 0.546965 0.933293 0.518316 0.940112
FUSRevertant_Untreated 0.885308 0.618814 0.916533 0.464872 0.953532
OPTN_Untreated 0.804317 0.441263 0.888348 0.477740 0.872922
TBK1_Untreated 0.875079 0.309653 0.949309 0.445048 0.912850
TDP43_Untreated 0.739795 0.224097 0.851127 0.245265 0.835558
WT_Untreated 0.823744 0.548003 0.883334 0.503751 0.900428
Macro Average 0.843896 0.466643 0.907871 0.456514 0.908194
# Label mapping
label_map = {
'WT_Untreated': 0,
'FUSRevertant_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'TDP43_Untreated':1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
label_map = label_map
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2]
Test: (10932, 5568) Labels: [0 1 2]
2: 9892
0: 12232
1: 19345
precision recall f1-score support
0 0.39 0.86 0.53 2576
1 0.87 0.41 0.56 5889
2 1.00 1.00 1.00 2467
accuracy 0.65 10932
macro avg 0.75 0.76 0.70 10932
weighted avg 0.78 0.65 0.65 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2]
Test: (8356, 5568) Labels: [0 1 2]
2: 9966
0: 13222
1: 20857
precision recall f1-score support
0 0.50 0.86 0.63 1586
1 0.92 0.69 0.79 4377
2 1.00 0.99 0.99 2393
accuracy 0.81 8356
macro avg 0.81 0.85 0.81 8356
weighted avg 0.87 0.81 0.82 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2]
Test: (6931, 5568) Labels: [0 1 2]
2: 10555
0: 12759
1: 22156
precision recall f1-score support
0 0.76 0.57 0.65 2049
1 0.75 0.88 0.81 3078
2 1.00 1.00 1.00 1804
accuracy 0.82 6931
macro avg 0.84 0.82 0.82 6931
weighted avg 0.82 0.82 0.81 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2]
Test: (163, 5568) Labels: [0 1 2]
2: 12308
0: 14727
1: 25203
precision recall f1-score support
0 0.80 0.05 0.09 81
1 0.20 1.00 0.33 31
2 1.00 0.02 0.04 51
accuracy 0.22 163
macro avg 0.67 0.36 0.15 163
weighted avg 0.75 0.22 0.12 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2]
Test: (9550, 5568) Labels: [0 1 2]
2: 9230
0: 12135
1: 21486
precision recall f1-score support
0 0.54 0.62 0.58 2673
1 0.70 0.62 0.66 3748
2 1.00 1.00 1.00 3129
accuracy 0.75 9550
macro avg 0.75 0.75 0.75 9550
weighted avg 0.75 0.75 0.75 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2]
Test: (9193, 5568) Labels: [0 1 2]
2: 10777
0: 11206
1: 21225
precision recall f1-score support
0 0.94 0.17 0.29 3602
1 0.51 0.76 0.61 4009
2 0.63 1.00 0.77 1582
accuracy 0.57 9193
macro avg 0.69 0.64 0.56 9193
weighted avg 0.70 0.57 0.51 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2]
Test: (7276, 5568) Labels: [0 1 2]
2: 11426
0: 12567
1: 21132
precision recall f1-score support
0 0.37 0.99 0.54 2241
1 0.93 0.08 0.15 4102
2 1.00 1.00 1.00 933
accuracy 0.48 7276
macro avg 0.77 0.69 0.56 7276
weighted avg 0.77 0.48 0.38 7276
=== Overall Accuracy ===
0.6129945008100668 [0.6478229052323454, 0.807084729535663, 0.8184966094358679, 0.22085889570552147, 0.7471204188481675, 0.5697813553790928, 0.47979659153380977]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
0 0.694605 0.622771 0.722901 0.469576 0.829498
1 0.675789 0.550765 0.791917 0.710859 0.654916
2 0.980687 0.993527 0.976724 0.929453 0.997959
Macro Average 0.783694 0.722354 0.830514 0.703296 0.827458
# Label mapping
label_map = {
'WT_Untreated': 0,
'FUSRevertant_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'TDP43_Untreated':1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=True,
norm=False,
choose_features=False,
top_k=100,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
label_map = label_map
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2]
Test: (10932, 5568) Labels: [0 1 2]
2: 9892
0: 12232
1: 19345
precision recall f1-score support
0 0.38 0.89 0.53 2576
1 0.88 0.35 0.51 5889
2 1.00 1.00 1.00 2467
accuracy 0.63 10932
macro avg 0.75 0.75 0.68 10932
weighted avg 0.79 0.63 0.62 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2]
Test: (8356, 5568) Labels: [0 1 2]
2: 9966
0: 13222
1: 20857
precision recall f1-score support
0 0.46 0.89 0.61 1586
1 0.93 0.63 0.75 4377
2 1.00 0.99 1.00 2393
accuracy 0.78 8356
macro avg 0.80 0.84 0.79 8356
weighted avg 0.86 0.78 0.80 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2]
Test: (6931, 5568) Labels: [0 1 2]
2: 10555
0: 12759
1: 22156
precision recall f1-score support
0 0.72 0.64 0.68 2049
1 0.78 0.83 0.80 3078
2 1.00 1.00 1.00 1804
accuracy 0.82 6931
macro avg 0.83 0.82 0.83 6931
weighted avg 0.82 0.82 0.82 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2]
Test: (163, 5568) Labels: [0 1 2]
2: 12308
0: 14727
1: 25203
precision recall f1-score support
0 0.88 0.09 0.16 81
1 0.20 1.00 0.34 31
2 1.00 0.04 0.08 51
accuracy 0.25 163
macro avg 0.69 0.38 0.19 163
weighted avg 0.79 0.25 0.17 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2]
Test: (9550, 5568) Labels: [0 1 2]
2: 9230
0: 12135
1: 21486
precision recall f1-score support
0 0.52 0.73 0.60 2673
1 0.73 0.51 0.60 3748
2 1.00 1.00 1.00 3129
accuracy 0.73 9550
macro avg 0.75 0.75 0.73 9550
weighted avg 0.76 0.73 0.73 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2]
Test: (9193, 5568) Labels: [0 1 2]
2: 10777
0: 11206
1: 21225
precision recall f1-score support
0 0.93 0.24 0.39 3602
1 0.51 0.71 0.60 4009
2 0.59 1.00 0.74 1582
accuracy 0.58 9193
macro avg 0.68 0.65 0.57 9193
weighted avg 0.69 0.58 0.54 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2]
Test: (7276, 5568) Labels: [0 1 2]
2: 11426
0: 12567
1: 21132
precision recall f1-score support
0 0.37 0.99 0.54 2241
1 0.95 0.07 0.13 4102
2 1.00 1.00 1.00 933
accuracy 0.47 7276
macro avg 0.77 0.69 0.56 7276
weighted avg 0.78 0.47 0.37 7276
=== Overall Accuracy ===
0.6082226606460679 [0.6268752286864252, 0.7826711345141216, 0.8195065647092772, 0.24539877300613497, 0.7320418848167539, 0.577178287827695, 0.47388675096206706]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
0 0.687468 0.681186 0.689942 0.463919 0.846011
1 0.665503 0.494729 0.824125 0.723207 0.637155
2 0.977462 0.994174 0.972304 0.917214 0.998154
Macro Average 0.776811 0.723363 0.828790 0.701447 0.827107
# Label mapping
label_map = {
'WT_Untreated': 0,
'FUSRevertant_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'TDP43_Untreated':1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=True,
top_k=100,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
label_map = label_map
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2]
Test: (10932, 5568) Labels: [0 1 2]
2: 9892
0: 12232
1: 19345
Selecting top 100 features...
precision recall f1-score support
0 0.19 0.36 0.25 2576
1 0.54 0.33 0.41 5889
2 1.00 0.99 1.00 2467
accuracy 0.49 10932
macro avg 0.58 0.56 0.55 10932
weighted avg 0.56 0.49 0.51 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2]
Test: (8356, 5568) Labels: [0 1 2]
2: 9966
0: 13222
1: 20857
Selecting top 100 features...
precision recall f1-score support
0 0.25 0.20 0.22 1586
1 0.72 0.78 0.75 4377
2 1.00 0.98 0.99 2393
accuracy 0.73 8356
macro avg 0.66 0.65 0.65 8356
weighted avg 0.71 0.73 0.72 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2]
Test: (6931, 5568) Labels: [0 1 2]
2: 10555
0: 12759
1: 22156
Selecting top 100 features...
precision recall f1-score support
0 0.42 0.10 0.16 2049
1 0.60 0.91 0.72 3078
2 1.00 0.98 0.99 1804
accuracy 0.69 6931
macro avg 0.67 0.66 0.63 6931
weighted avg 0.65 0.69 0.63 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2]
Test: (163, 5568) Labels: [0 1 2]
2: 12308
0: 14727
1: 25203
Selecting top 100 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.00 0.00 0.00 81
1 0.20 1.00 0.33 31
2 0.83 0.10 0.18 51
accuracy 0.22 163
macro avg 0.34 0.37 0.17 163
weighted avg 0.30 0.22 0.12 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2]
Test: (9550, 5568) Labels: [0 1 2]
2: 9230
0: 12135
1: 21486
Selecting top 100 features...
precision recall f1-score support
0 0.25 0.17 0.21 2673
1 0.52 0.64 0.57 3748
2 1.00 0.99 1.00 3129
accuracy 0.62 9550
macro avg 0.59 0.60 0.59 9550
weighted avg 0.60 0.62 0.61 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2]
Test: (9193, 5568) Labels: [0 1 2]
2: 10777
0: 11206
1: 21225
Selecting top 100 features...
precision recall f1-score support
0 0.69 0.00 0.01 3602
1 0.38 0.54 0.44 4009
2 0.46 1.00 0.63 1582
accuracy 0.41 9193
macro avg 0.51 0.51 0.36 9193
weighted avg 0.51 0.41 0.31 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2]
Test: (7276, 5568) Labels: [0 1 2]
2: 11426
0: 12567
1: 21132
Selecting top 100 features...
precision recall f1-score support
0 0.32 0.36 0.34 2241
1 0.62 0.58 0.60 4102
2 1.00 0.99 1.00 933
accuracy 0.57 7276
macro avg 0.65 0.64 0.65 7276
weighted avg 0.58 0.57 0.57 7276
=== Overall Accuracy ===
0.5320315149283343 [0.4880168313208928, 0.7277405457156534, 0.6885009378156111, 0.22085889570552147, 0.6231413612565445, 0.409441966713804, 0.5665200659703133]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
0 0.612240 0.184630 0.780677 0.249021 0.708512
1 0.573749 0.600460 0.548938 0.552872 0.596639
2 0.961356 0.985112 0.954023 0.868650 0.995206
Macro Average 0.715782 0.590067 0.761213 0.556848 0.766786
# Label mapping
label_map = {
'WT_Untreated': 0,
'FUSRevertant_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 1,
'TDP43_Untreated':1,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
label_map = label_map,
apply_pca=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2]
Test: (10932, 5568) Labels: [0 1 2]
2: 9892
0: 12232
1: 19345
precision recall f1-score support
0 0.27 0.61 0.38 2576
1 0.63 0.29 0.40 5889
2 1.00 1.00 1.00 2467
accuracy 0.52 10932
macro avg 0.63 0.63 0.59 10932
weighted avg 0.63 0.52 0.53 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2]
Test: (8356, 5568) Labels: [0 1 2]
2: 9966
0: 13222
1: 20857
precision recall f1-score support
0 0.41 0.62 0.50 1586
1 0.82 0.69 0.75 4377
2 1.00 0.98 0.99 2393
accuracy 0.76 8356
macro avg 0.75 0.76 0.74 8356
weighted avg 0.80 0.76 0.77 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2]
Test: (6931, 5568) Labels: [0 1 2]
2: 10555
0: 12759
1: 22156
precision recall f1-score support
0 0.76 0.13 0.22 2049
1 0.63 0.97 0.76 3078
2 1.00 1.00 1.00 1804
accuracy 0.73 6931
macro avg 0.80 0.70 0.66 6931
weighted avg 0.76 0.73 0.66 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2]
Test: (163, 5568) Labels: [0 1 2]
2: 12308
0: 14727
1: 25203
precision recall f1-score support
0 1.00 0.01 0.02 81
1 0.19 1.00 0.32 31
2 1.00 0.02 0.04 51
accuracy 0.20 163
macro avg 0.73 0.34 0.13 163
weighted avg 0.85 0.20 0.09 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2]
Test: (9550, 5568) Labels: [0 1 2]
2: 9230
0: 12135
1: 21486
precision recall f1-score support
0 0.57 0.36 0.44 2673
1 0.64 0.81 0.71 3748
2 1.00 1.00 1.00 3129
accuracy 0.74 9550
macro avg 0.74 0.72 0.72 9550
weighted avg 0.74 0.74 0.73 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2]
Test: (9193, 5568) Labels: [0 1 2]
2: 10777
0: 11206
1: 21225
precision recall f1-score support
0 0.79 0.04 0.08 3602
1 0.48 0.81 0.61 4009
2 0.68 1.00 0.81 1582
accuracy 0.54 9193
macro avg 0.65 0.62 0.50 9193
weighted avg 0.64 0.54 0.44 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2]
Test: (7276, 5568) Labels: [0 1 2]
2: 11426
0: 12567
1: 21132
precision recall f1-score support
0 0.36 0.98 0.53 2241
1 0.83 0.06 0.12 4102
2 1.00 1.00 1.00 933
accuracy 0.46 7276
macro avg 0.73 0.68 0.55 7276
weighted avg 0.71 0.46 0.36 7276
=== Overall Accuracy ===
0.5661394223845085 [0.5241492864983535, 0.7571804691239827, 0.7296205453758476, 0.20245398773006135, 0.7442931937172775, 0.540737517676493, 0.4645409565695437]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
0 0.638213 0.412480 0.727130 0.373213 0.758568
1 0.622717 0.564794 0.676519 0.618576 0.625966
2 0.983550 0.990857 0.981295 0.942362 0.997132
Macro Average 0.748160 0.656043 0.794981 0.644717 0.793889
# Label mapping
label_map = {
'WT_Untreated': 0,
'FUSRevertant_Untreated': 0,
'TBK1_Untreated': 1,
'OPTN_Untreated': 3,
'TDP43_Untreated':4,
'FUSHeterozygous_Untreated': 2,
'FUSHomozygous_Untreated': 2,
}
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=True,
norm=False,
choose_features=False,
top_k=100,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
label_map = label_map
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4]
Test: (10932, 5568) Labels: [0 1 2 3 4]
2: 9892
0: 12232
3: 7535
1: 4205
4: 7605
precision recall f1-score support
0 0.38 0.75 0.50 2576
1 0.14 0.00 0.01 1876
2 1.00 1.00 1.00 2467
3 0.79 0.46 0.58 2314
4 0.35 0.40 0.37 1699
accuracy 0.56 10932
macro avg 0.53 0.52 0.49 10932
weighted avg 0.56 0.56 0.52 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4]
Test: (8356, 5568) Labels: [0 1 2 3 4]
2: 9966
0: 13222
3: 8200
1: 4861
4: 7796
precision recall f1-score support
0 0.48 0.86 0.61 1586
1 0.69 0.68 0.68 1220
2 1.00 0.99 0.99 2393
3 0.62 0.56 0.59 1649
4 0.34 0.10 0.15 1508
accuracy 0.67 8356
macro avg 0.62 0.64 0.61 8356
weighted avg 0.66 0.67 0.64 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4]
Test: (6931, 5568) Labels: [0 1 2 3 4]
2: 10555
0: 12759
3: 8746
1: 5036
4: 8374
precision recall f1-score support
0 0.75 0.59 0.66 2049
1 0.44 0.89 0.59 1045
2 1.00 1.00 1.00 1804
3 0.50 0.33 0.39 1103
4 0.47 0.33 0.39 930
accuracy 0.67 6931
macro avg 0.63 0.63 0.61 6931
weighted avg 0.69 0.67 0.66 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4]
Test: (163, 5568) Labels: [0 1 2 3 4]
2: 12308
0: 14727
3: 9844
1: 6068
4: 9291
precision recall f1-score support
0 1.00 0.12 0.22 81
1 0.50 0.08 0.13 13
2 1.00 0.02 0.04 51
3 0.05 0.80 0.10 5
4 0.05 0.31 0.09 13
accuracy 0.12 163
macro avg 0.52 0.27 0.12 163
weighted avg 0.86 0.12 0.14 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4]
Test: (9550, 5568) Labels: [0 1 2 3 4]
2: 9230
0: 12135
3: 8420
1: 5326
4: 7740
precision recall f1-score support
0 0.47 0.43 0.45 2673
1 0.71 0.13 0.22 755
2 1.00 1.00 1.00 3129
3 0.39 0.19 0.25 1429
4 0.28 0.56 0.37 1564
accuracy 0.58 9550
macro avg 0.57 0.46 0.46 9550
weighted avg 0.62 0.58 0.57 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4]
Test: (9193, 5568) Labels: [0 1 2 3 4]
2: 10777
0: 11206
3: 8263
1: 5097
4: 7865
# Label mapping
label_map = {
'WT_Untreated': 0,
'FUSRevertant_Untreated': 1,
'TBK1_Untreated': 2,
'OPTN_Untreated': 3,
'TDP43_Untreated':4,
'FUSHeterozygous_Untreated': 5,
'FUSHomozygous_Untreated': 6,
}
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={'class_weight':{0: 10, 1: 1.0, 2: 1.0, 3: 1, 4: 1.0, 5: 1.0, 6:1}},
label_map = label_map
)
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=False,
top_k=100,
apply_pca=False,
pca_components=50,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
return_proba=False
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.66 0.91 0.76 1222
1 0.87 0.53 0.66 1245
2 0.81 0.91 0.86 1015
3 0.79 0.50 0.61 2314
4 0.13 0.00 0.00 1876
5 0.28 0.30 0.29 1699
6 0.28 0.71 0.40 1561
accuracy 0.50 10932
macro avg 0.54 0.55 0.51 10932
weighted avg 0.52 0.50 0.47 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.63 0.64 0.63 1231
1 0.62 0.59 0.60 1162
2 0.48 0.99 0.64 800
3 0.57 0.65 0.61 1649
4 0.64 0.61 0.62 1220
5 0.39 0.14 0.21 1508
6 0.61 0.55 0.58 786
accuracy 0.57 8356
macro avg 0.56 0.60 0.56 8356
weighted avg 0.56 0.57 0.54 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.73 0.97 0.84 1004
1 0.94 0.57 0.71 800
2 0.70 0.73 0.72 1131
3 0.47 0.40 0.43 1103
4 0.42 0.82 0.56 1045
5 0.49 0.38 0.43 930
6 0.57 0.17 0.26 918
accuracy 0.59 6931
macro avg 0.62 0.58 0.56 6931
weighted avg 0.61 0.59 0.56 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.00 0.00 0.00 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.06 0.80 0.12 5
4 0.50 0.08 0.13 13
5 0.07 0.23 0.11 13
6 0.62 0.97 0.76 37
accuracy 0.27 163
macro avg 0.18 0.30 0.16 163
weighted avg 0.19 0.27 0.20 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.81 0.19 0.31 1567
1 0.54 0.95 0.69 1562
2 0.65 0.45 0.53 1163
3 0.43 0.24 0.31 1429
4 0.73 0.14 0.23 755
5 0.29 0.64 0.40 1564
6 0.43 0.36 0.39 1510
accuracy 0.45 9550
macro avg 0.55 0.42 0.41 9550
weighted avg 0.54 0.45 0.42 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.19 0.27 0.22 963
1 0.28 0.54 0.37 619
2 0.79 0.11 0.19 1298
3 0.18 0.44 0.26 1586
4 0.34 0.24 0.28 984
5 0.10 0.09 0.10 1439
6 0.94 0.26 0.41 2304
accuracy 0.26 9193
macro avg 0.40 0.28 0.26 9193
weighted avg 0.47 0.26 0.27 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.59 0.21 0.31 267
1 0.75 0.94 0.84 666
2 0.04 0.87 0.08 45
3 0.96 0.03 0.05 1763
4 0.28 0.45 0.35 188
5 0.09 0.01 0.01 2151
6 0.43 0.98 0.60 2196
accuracy 0.42 7276
macro avg 0.45 0.50 0.32 7276
weighted avg 0.49 0.42 0.30 7276
=== Overall Accuracy ===
0.43597176371995305 [0.5032930845225028, 0.5667783628530397, 0.5854854999278604, 0.26993865030674846, 0.45057591623036647, 0.2599804198846949, 0.4157504123144585]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.896433 0.557006 0.942651 0.569429 0.939859
FUSHomozygous_Untreated 0.910059 0.695016 0.938280 0.596414 0.959088
FUSRevertant_Untreated 0.907292 0.591157 0.944334 0.554437 0.951720
OPTN_Untreated 0.784909 0.382577 0.878032 0.420630 0.860023
TBK1_Untreated 0.878094 0.333827 0.949547 0.464850 0.915664
TDP43_Untreated 0.753383 0.240542 0.864097 0.276467 0.840518
WT_Untreated 0.788057 0.542311 0.841166 0.424584 0.894784
Macro Average 0.845461 0.477491 0.908301 0.472401 0.908808
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=True,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
precision recall f1-score support
0 0.65 0.92 0.76 1222
1 0.87 0.52 0.65 1245
2 0.77 0.93 0.84 1015
3 0.79 0.50 0.61 2314
4 0.21 0.00 0.01 1876
5 0.28 0.31 0.30 1699
6 0.28 0.71 0.40 1561
accuracy 0.50 10932
macro avg 0.55 0.56 0.51 10932
weighted avg 0.53 0.50 0.47 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
precision recall f1-score support
0 0.62 0.64 0.63 1231
1 0.62 0.57 0.59 1162
2 0.45 0.99 0.62 800
3 0.59 0.62 0.60 1649
4 0.61 0.64 0.62 1220
5 0.37 0.13 0.19 1508
6 0.62 0.55 0.58 786
accuracy 0.56 8356
macro avg 0.55 0.59 0.55 8356
weighted avg 0.55 0.56 0.53 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
precision recall f1-score support
0 0.73 0.97 0.83 1004
1 0.93 0.57 0.71 800
2 0.69 0.75 0.72 1131
3 0.47 0.34 0.40 1103
4 0.40 0.83 0.53 1045
5 0.49 0.34 0.40 930
6 0.56 0.16 0.25 918
accuracy 0.57 6931
macro avg 0.61 0.56 0.55 6931
weighted avg 0.60 0.57 0.55 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 1.00 0.04 0.07 26
1 0.00 0.00 0.00 25
2 0.00 0.00 0.00 44
3 0.07 0.80 0.12 5
4 0.50 0.08 0.13 13
5 0.07 0.23 0.11 13
6 0.60 0.97 0.74 37
accuracy 0.28 163
macro avg 0.32 0.30 0.17 163
weighted avg 0.34 0.28 0.20 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
precision recall f1-score support
0 0.81 0.21 0.34 1567
1 0.55 0.94 0.69 1562
2 0.61 0.50 0.55 1163
3 0.43 0.22 0.29 1429
4 0.69 0.16 0.26 755
5 0.30 0.65 0.41 1564
6 0.43 0.35 0.39 1510
accuracy 0.46 9550
macro avg 0.55 0.43 0.42 9550
weighted avg 0.53 0.46 0.43 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
precision recall f1-score support
0 0.20 0.32 0.24 963
1 0.27 0.51 0.36 619
2 0.80 0.14 0.24 1298
3 0.18 0.40 0.24 1586
4 0.35 0.29 0.31 984
5 0.10 0.09 0.09 1439
6 0.94 0.27 0.42 2304
accuracy 0.27 9193
macro avg 0.41 0.29 0.27 9193
weighted avg 0.47 0.27 0.28 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
precision recall f1-score support
0 0.59 0.19 0.29 267
1 0.75 0.95 0.84 666
2 0.04 0.89 0.08 45
3 0.96 0.03 0.05 1763
4 0.28 0.46 0.35 188
5 0.10 0.01 0.01 2151
6 0.43 0.98 0.60 2196
accuracy 0.41 7276
macro avg 0.45 0.50 0.32 7276
weighted avg 0.49 0.41 0.29 7276
=== Overall Accuracy ===
0.43583060784180494 [0.5038419319429198, 0.5594782192436573, 0.5730774779974029, 0.27607361963190186, 0.4568586387434555, 0.2668334602414881, 0.4146509070918087]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.895670 0.569268 0.940114 0.564147 0.941277
FUSHomozygous_Untreated 0.910078 0.688436 0.939165 0.597601 0.958280
FUSRevertant_Untreated 0.903647 0.618268 0.937086 0.535202 0.954443
OPTN_Untreated 0.787828 0.359326 0.887009 0.423985 0.856767
TBK1_Untreated 0.873991 0.351751 0.942552 0.445625 0.917187
TDP43_Untreated 0.756512 0.235490 0.868993 0.279571 0.840387
WT_Untreated 0.789546 0.537586 0.843997 0.426842 0.894131
Macro Average 0.845325 0.480018 0.908417 0.467568 0.908925
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=True,
norm=False,
choose_features=True,
top_k=10,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Selecting top 10 features...
precision recall f1-score support
0 0.44 0.59 0.50 1222
1 0.38 0.23 0.29 1245
2 0.16 0.12 0.14 1015
3 0.20 0.07 0.11 2314
4 0.22 0.09 0.13 1876
5 0.26 0.05 0.08 1699
6 0.11 0.41 0.17 1561
accuracy 0.20 10932
macro avg 0.25 0.22 0.20 10932
weighted avg 0.24 0.20 0.18 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Selecting top 10 features...
precision recall f1-score support
0 0.51 0.17 0.26 1231
1 0.48 0.72 0.58 1162
2 0.14 0.32 0.19 800
3 0.48 0.20 0.29 1649
4 0.17 0.15 0.16 1220
5 0.29 0.27 0.28 1508
6 0.16 0.25 0.20 786
accuracy 0.29 8356
macro avg 0.32 0.30 0.28 8356
weighted avg 0.34 0.29 0.28 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Selecting top 10 features...
precision recall f1-score support
0 0.55 0.27 0.36 1004
1 0.46 0.70 0.56 800
2 0.17 0.15 0.16 1131
3 0.16 0.06 0.09 1103
4 0.20 0.21 0.20 1045
5 0.18 0.23 0.20 930
6 0.18 0.28 0.22 918
accuracy 0.25 6931
macro avg 0.27 0.27 0.26 6931
weighted avg 0.26 0.25 0.24 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Selecting top 10 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.25 0.12 0.16 26
1 0.14 0.76 0.24 25
2 0.00 0.00 0.00 44
3 0.00 0.00 0.00 5
4 0.00 0.00 0.00 13
5 0.00 0.00 0.00 13
6 0.00 0.00 0.00 37
accuracy 0.13 163
macro avg 0.06 0.13 0.06 163
weighted avg 0.06 0.13 0.06 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Selecting top 10 features...
precision recall f1-score support
0 0.49 0.51 0.50 1567
1 0.48 0.44 0.46 1562
2 0.16 0.25 0.20 1163
3 0.29 0.15 0.20 1429
4 0.14 0.06 0.08 755
5 0.21 0.27 0.24 1564
6 0.24 0.26 0.25 1510
accuracy 0.30 9550
macro avg 0.29 0.28 0.27 9550
weighted avg 0.30 0.30 0.29 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Selecting top 10 features...
precision recall f1-score support
0 0.28 0.96 0.43 963
1 0.26 0.14 0.18 619
2 0.41 0.13 0.19 1298
3 0.00 0.00 0.00 1586
4 0.00 0.00 0.00 984
5 0.00 0.00 0.00 1439
6 0.12 0.02 0.04 2304
accuracy 0.13 9193
macro avg 0.15 0.18 0.12 9193
weighted avg 0.13 0.13 0.09 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Selecting top 10 features...
precision recall f1-score support
0 0.13 0.12 0.12 267
1 0.67 0.67 0.67 666
2 0.01 0.40 0.02 45
3 0.19 0.05 0.08 1763
4 0.03 0.10 0.05 188
5 0.34 0.30 0.32 2151
6 0.41 0.36 0.39 2196
accuracy 0.28 7276
macro avg 0.25 0.29 0.24 7276
weighted avg 0.34 0.28 0.30 7276
=== Overall Accuracy ===
0.22780189082915434 [0.20087815587266739, 0.2892532312111058, 0.2547972875486943, 0.13496932515337423, 0.2978010471204188, 0.13434134667681932, 0.28257284222100054]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.844564 0.470860 0.895449 0.380126 0.925530
FUSHomozygous_Untreated 0.875747 0.481987 0.927421 0.465671 0.931705
FUSRevertant_Untreated 0.794660 0.186317 0.865942 0.140044 0.900818
OPTN_Untreated 0.702601 0.090161 0.844355 0.118227 0.800379
TBK1_Untreated 0.829011 0.103766 0.924223 0.152379 0.887070
TDP43_Untreated 0.757695 0.191208 0.879992 0.255934 0.834433
WT_Untreated 0.674186 0.249893 0.765880 0.187435 0.825314
Macro Average 0.782638 0.253456 0.871894 0.242831 0.872179
## Baseline
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=True,
top_k=1000,
label_map=None,
apply_pca = True,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Selecting top 1000 features...
precision recall f1-score support
0 0.57 0.89 0.70 1222
1 0.77 0.35 0.48 1245
2 0.55 0.67 0.61 1015
3 0.52 0.49 0.50 2314
4 0.06 0.01 0.01 1876
5 0.28 0.17 0.21 1699
6 0.21 0.52 0.30 1561
accuracy 0.41 10932
macro avg 0.42 0.44 0.40 10932
weighted avg 0.40 0.41 0.37 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Selecting top 1000 features...
precision recall f1-score support
0 0.53 0.48 0.50 1231
1 0.49 0.52 0.51 1162
2 0.33 0.72 0.45 800
3 0.38 0.49 0.43 1649
4 0.40 0.41 0.41 1220
5 0.33 0.06 0.10 1508
6 0.40 0.31 0.35 786
accuracy 0.41 8356
macro avg 0.41 0.43 0.39 8356
weighted avg 0.41 0.41 0.38 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Selecting top 1000 features...
precision recall f1-score support
0 0.67 0.84 0.75 1004
1 0.73 0.49 0.58 800
2 0.46 0.50 0.48 1131
3 0.29 0.34 0.32 1103
4 0.29 0.48 0.36 1045
5 0.45 0.36 0.40 930
6 0.24 0.03 0.06 918
accuracy 0.44 6931
macro avg 0.45 0.43 0.42 6931
weighted avg 0.44 0.44 0.42 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Selecting top 1000 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 1.00 0.08 0.14 26
1 1.00 0.12 0.21 25
2 0.00 0.00 0.00 44
3 0.04 1.00 0.08 5
4 0.00 0.00 0.00 13
5 0.05 0.08 0.06 13
6 0.89 0.46 0.61 37
accuracy 0.17 163
macro avg 0.43 0.25 0.16 163
weighted avg 0.52 0.17 0.20 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Selecting top 1000 features...
precision recall f1-score support
0 0.67 0.04 0.07 1567
1 0.50 0.98 0.66 1562
2 0.38 0.15 0.22 1163
3 0.40 0.23 0.29 1429
4 0.34 0.26 0.29 755
5 0.25 0.48 0.33 1564
6 0.32 0.33 0.32 1510
accuracy 0.37 9550
macro avg 0.41 0.35 0.31 9550
weighted avg 0.42 0.37 0.32 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Selecting top 1000 features...
precision recall f1-score support
0 0.18 0.33 0.23 963
1 0.31 0.69 0.43 619
2 0.22 0.03 0.05 1298
3 0.02 0.06 0.03 1586
4 0.16 0.03 0.05 984
5 0.07 0.06 0.07 1439
6 0.73 0.12 0.21 2304
accuracy 0.14 9193
macro avg 0.24 0.19 0.15 9193
weighted avg 0.29 0.14 0.13 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Selecting top 1000 features...
precision recall f1-score support
0 0.54 0.76 0.63 267
1 0.89 0.74 0.81 666
2 0.04 0.93 0.08 45
3 0.97 0.03 0.06 1763
4 0.09 0.05 0.07 188
5 0.06 0.00 0.01 2151
6 0.41 0.97 0.58 2196
accuracy 0.40 7276
macro avg 0.43 0.50 0.32 7276
weighted avg 0.48 0.40 0.29 7276
=== Overall Accuracy ===
0.3338935016889238 [0.4060556165386023, 0.4088080421254189, 0.43904198528350885, 0.17177914110429449, 0.37036649214659684, 0.13782225606439683, 0.40338097855964816]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.873304 0.493790 0.924980 0.472641 0.930650
FUSHomozygous_Untreated 0.892502 0.638592 0.925824 0.530473 0.951268
FUSRevertant_Untreated 0.863991 0.377365 0.921011 0.358886 0.926601
OPTN_Untreated 0.714032 0.284699 0.813405 0.260983 0.830881
TBK1_Untreated 0.854697 0.205723 0.939896 0.310037 0.900136
TDP43_Untreated 0.761875 0.166595 0.890387 0.247051 0.831899
WT_Untreated 0.751989 0.429875 0.821602 0.342429 0.869593
Macro Average 0.816056 0.370948 0.891015 0.360357 0.891575
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=True,
top_k=5,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
=== Batch 1 === Train: (41469, 5568) Labels: [0 1 2 3 4 5 6] Test: (10932, 5568) Labels: [0 1 2 3 4 5 6] FUSHeterozygous_Untreated: 5058 FUSHomozygous_Untreated: 4834 FUSRevertant_Untreated: 4481 OPTN_Untreated: 7535 TBK1_Untreated: 4205 TDP43_Untreated: 7605 WT_Untreated: 7751 Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.44 0.55 0.49 1222
1 0.42 0.29 0.34 1245
2 0.00 0.00 0.00 1015
3 0.26 0.15 0.19 2314
4 0.07 0.00 0.00 1876
5 0.21 0.33 0.25 1699
6 0.14 0.40 0.21 1561
accuracy 0.24 10932
macro avg 0.22 0.25 0.21 10932
weighted avg 0.22 0.24 0.20 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Selecting top 5 features...
precision recall f1-score support
0 0.41 0.06 0.10 1231
1 0.47 0.74 0.58 1162
2 0.00 0.00 0.00 800
3 0.29 0.21 0.25 1649
4 0.18 0.01 0.02 1220
5 0.26 0.36 0.30 1508
6 0.13 0.49 0.21 786
accuracy 0.27 8356
macro avg 0.25 0.27 0.21 8356
weighted avg 0.27 0.27 0.22 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.49 0.15 0.23 1004
1 0.45 0.76 0.56 800
2 0.00 0.00 0.00 1131
3 0.17 0.15 0.16 1103
4 0.07 0.00 0.00 1045
5 0.18 0.53 0.27 930
6 0.21 0.36 0.26 918
accuracy 0.25 6931
macro avg 0.22 0.28 0.21 6931
weighted avg 0.21 0.25 0.19 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.00 0.00 0.00 26
1 0.13 0.72 0.23 25
2 0.00 0.00 0.00 44
3 0.00 0.00 0.00 5
4 0.00 0.00 0.00 13
5 0.00 0.00 0.00 13
6 0.00 0.00 0.00 37
accuracy 0.11 163
macro avg 0.02 0.10 0.03 163
weighted avg 0.02 0.11 0.03 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Selecting top 5 features...
precision recall f1-score support
0 0.47 0.59 0.52 1567
1 0.41 0.26 0.32 1562
2 0.00 0.00 0.00 1163
3 0.24 0.36 0.29 1429
4 0.03 0.00 0.00 755
5 0.22 0.30 0.25 1564
6 0.23 0.33 0.27 1510
accuracy 0.30 9550
macro avg 0.23 0.26 0.24 9550
weighted avg 0.25 0.30 0.27 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.35 0.97 0.51 963
1 0.16 0.13 0.15 619
2 0.19 0.01 0.01 1298
3 0.01 0.04 0.02 1586
4 0.00 0.00 0.00 984
5 0.00 0.00 0.00 1439
6 0.15 0.01 0.01 2304
accuracy 0.12 9193
macro avg 0.12 0.17 0.10 9193
weighted avg 0.11 0.12 0.07 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.23 0.46 0.31 267
1 0.66 0.35 0.46 666
2 0.00 0.00 0.00 45
3 0.26 0.30 0.28 1763
4 0.00 0.00 0.00 188
5 0.36 0.39 0.38 2151
6 0.41 0.38 0.40 2196
accuracy 0.35 7276
macro avg 0.28 0.27 0.26 7276
weighted avg 0.36 0.35 0.35 7276
=== Overall Accuracy ===
0.2329581302927753 [0.23609586534943286, 0.26627573001436095, 0.25046890780551145, 0.11042944785276074, 0.2959162303664922, 0.1200913738714239, 0.35142935678944476]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.852274 0.458599 0.905878 0.398837 0.924745
FUSHomozygous_Untreated 0.866758 0.424247 0.924831 0.425507 0.924471
FUSRevertant_Untreated 0.894544 0.001456 0.999190 0.173913 0.895177
OPTN_Untreated 0.636057 0.200426 0.736887 0.149886 0.799266
TBK1_Untreated 0.882006 0.002138 0.997517 0.101562 0.883917
TDP43_Untreated 0.696342 0.311801 0.779358 0.233763 0.839889
WT_Untreated 0.669987 0.289089 0.752303 0.201422 0.830413
Macro Average 0.785424 0.241108 0.870852 0.240699 0.871126
run_baseline_model(
batches=[1, 2, 3, 7, 8, 9, 10],
balance=False,
norm=False,
choose_features=True,
top_k=20,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Selecting top 20 features...
precision recall f1-score support
0 0.47 0.73 0.57 1222
1 0.41 0.16 0.23 1245
2 0.19 0.03 0.05 1015
3 0.17 0.20 0.18 2314
4 0.08 0.00 0.00 1876
5 0.26 0.10 0.14 1699
6 0.10 0.31 0.15 1561
accuracy 0.20 10932
macro avg 0.24 0.22 0.19 10932
weighted avg 0.22 0.20 0.18 10932
=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Selecting top 20 features...
precision recall f1-score support
0 0.45 0.18 0.25 1231
1 0.46 0.67 0.54 1162
2 0.23 0.05 0.09 800
3 0.40 0.33 0.37 1649
4 0.16 0.02 0.03 1220
5 0.30 0.36 0.33 1508
6 0.15 0.51 0.23 786
accuracy 0.31 8356
macro avg 0.31 0.30 0.26 8356
weighted avg 0.32 0.31 0.28 8356
=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Selecting top 20 features...
precision recall f1-score support
0 0.58 0.36 0.45 1004
1 0.48 0.66 0.55 800
2 0.32 0.01 0.02 1131
3 0.18 0.11 0.14 1103
4 0.10 0.01 0.01 1045
5 0.18 0.39 0.24 930
6 0.14 0.36 0.20 918
accuracy 0.25 6931
macro avg 0.28 0.27 0.23 6931
weighted avg 0.28 0.25 0.21 6931
=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Selecting top 20 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
precision recall f1-score support
0 0.50 0.15 0.24 26
1 0.13 0.72 0.23 25
2 0.00 0.00 0.00 44
3 0.05 0.20 0.08 5
4 0.00 0.00 0.00 13
5 0.00 0.00 0.00 13
6 0.00 0.00 0.00 37
accuracy 0.14 163
macro avg 0.10 0.15 0.08 163
weighted avg 0.10 0.14 0.07 163
=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Selecting top 20 features...
precision recall f1-score support
0 0.49 0.51 0.50 1567
1 0.49 0.45 0.47 1562
2 0.11 0.02 0.03 1163
3 0.29 0.27 0.28 1429
4 0.03 0.00 0.00 755
5 0.21 0.36 0.27 1564
6 0.20 0.30 0.24 1510
accuracy 0.31 9550
macro avg 0.26 0.27 0.26 9550
weighted avg 0.29 0.31 0.29 9550
=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Selecting top 20 features...
precision recall f1-score support
0 0.35 0.96 0.51 963
1 0.14 0.17 0.16 619
2 0.28 0.01 0.02 1298
3 0.00 0.00 0.00 1586
4 0.00 0.00 0.00 984
5 0.00 0.00 0.00 1439
6 0.13 0.01 0.01 2304
accuracy 0.12 9193
macro avg 0.13 0.16 0.10 9193
weighted avg 0.12 0.12 0.07 9193
=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Selecting top 20 features...
precision recall f1-score support
0 0.17 0.24 0.20 267
1 0.64 0.52 0.58 666
2 0.02 0.31 0.04 45
3 0.15 0.03 0.05 1763
4 0.03 0.03 0.03 188
5 0.33 0.35 0.34 2151
6 0.36 0.46 0.40 2196
accuracy 0.31 7276
macro avg 0.24 0.28 0.23 7276
weighted avg 0.31 0.31 0.29 7276
=== Overall Accuracy ===
0.23270411625965645 [0.2039882912550311, 0.306247008137865, 0.2481604386091473, 0.1411042944785276, 0.30607329842931935, 0.11563145871859024, 0.3077240241891149]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
FUSHeterozygous_Untreated 0.858934 0.518471 0.905293 0.427072 0.932465
FUSHomozygous_Untreated 0.868399 0.441026 0.924485 0.433889 0.926485
FUSRevertant_Untreated 0.875231 0.023472 0.975035 0.099231 0.894973
OPTN_Untreated 0.649606 0.160524 0.762808 0.135429 0.796990
TBK1_Untreated 0.876090 0.006085 0.990307 0.076132 0.883579
TDP43_Untreated 0.726169 0.255804 0.827714 0.242733 0.837450
WT_Untreated 0.632469 0.289197 0.706654 0.175634 0.821436
Macro Average 0.783843 0.242083 0.870328 0.227160 0.870483